From 6c4c12c950362d283ebe636d6f8571e551146295 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 1 Dec 2017 18:54:21 -0500 Subject: [PATCH 001/184] [src] Bug-fix to conceptual bug in Minimum Bayes Risk/sausage code. Thanks:@jtrmal --- src/lat/sausages.cc | 8 ++++---- src/lat/sausages.h | 24 ++++++++++++++++++++---- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/lat/sausages.cc b/src/lat/sausages.cc index 7cb7a273b98..16a61b3f5eb 100644 --- a/src/lat/sausages.cc +++ b/src/lat/sausages.cc @@ -114,11 +114,11 @@ double MinimumBayesRisk::EditDistance(int32 N, int32 Q, for (int32 q = 0; q <= Q; q++) { if (q == 0) { alpha_dash_arc(q) = // line 15. - alpha_dash(s_a, q) + l(w_a, 0) + delta(); + alpha_dash(s_a, q) + l(w_a, 0, true); } else { // a1,a2,a3 are the 3 parts of min expression of line 17. int32 r_q = r(q); double a1 = alpha_dash(s_a, q-1) + l(w_a, r_q), - a2 = alpha_dash(s_a, q) + l(w_a, 0) + delta(), + a2 = alpha_dash(s_a, q) + l(w_a, 0, true), a3 = alpha_dash_arc(q-1) + l(0, r_q); alpha_dash_arc(q) = std::min(a1, std::min(a2, a3)); } @@ -166,11 +166,11 @@ void MinimumBayesRisk::AccStats() { const Arc &arc = arcs_[pre_[n][i]]; int32 s_a = arc.start_node, w_a = arc.word; BaseFloat p_a = arc.loglike; - alpha_dash_arc(0) = alpha_dash(s_a, 0) + l(w_a, 0) + delta(); // line 14. + alpha_dash_arc(0) = alpha_dash(s_a, 0) + l(w_a, 0, true); // line 14. for (int32 q = 1; q <= Q; q++) { // this loop == lines 15-18. int32 r_q = r(q); double a1 = alpha_dash(s_a, q-1) + l(w_a, r_q), - a2 = alpha_dash(s_a, q) + l(w_a, 0) + delta(), + a2 = alpha_dash(s_a, q) + l(w_a, 0, true), a3 = alpha_dash_arc(q-1) + l(0, r_q); if (a1 <= a2) { if (a1 <= a3) { b_arc[q] = 1; alpha_dash_arc(q) = a1; } diff --git a/src/lat/sausages.h b/src/lat/sausages.h index a6af91cc12f..9dab0b68713 100644 --- a/src/lat/sausages.h +++ b/src/lat/sausages.h @@ -128,8 +128,18 @@ class MinimumBayesRisk { /// Minimum-Bayes-Risk Decode. Top-level algorithm. Figure 6 of the paper. 
void MbrDecode(); - /// The basic edit-distance function l(a,b), as in the paper. - inline double l(int32 a, int32 b) { return (a == b ? 0.0 : 1.0); } + /// Without the 'penalize' argument this gives us the basic edit-distance + /// function l(a,b), as in the paper. + /// With the 'penalize' argument it can be interpreted as the edit distance + /// plus the 'delta' from the paper, except that we make a kind of conceptual + /// bug-fix and only apply the delta if the edit-distance was not already + /// zero. This bug-fix was necessary in order to force all the stats to show + /// up, that should show up, and applying the bug-fix makes the sausage stats + /// significantly less sparse. + inline double l(int32 a, int32 b, bool penalize = false) { + if (a == b) return 0.0; + else return (penalize ? 1.0 + delta() : 1.0); + } /// returns r_q, in one-based indexing, as in the paper. inline int32 r(int32 q) { return R_[q-1]; } @@ -151,8 +161,14 @@ class MinimumBayesRisk { // epsilon (0). (But if no words in vec, just one epsilon) static void NormalizeEps(std::vector *vec); - static inline BaseFloat delta() { return 1.0e-05; } // A constant - // used in the algorithm. + // delta() is a constant used in the algorithm, which penalizes + // the use of certain epsilon transitions in the edit-distance which would cause + // words not to show up in the accumulated edit-distance statistics. + // There has been a conceptual bug-fix versus the way it was presented in + // the paper: we now add delta only if the edit-distance was not already + // zero. + static inline BaseFloat delta() { return 1.0e-05; } + /// Function used to increment map. static inline void AddToMap(int32 i, double d, std::map *gamma) { From 38e63553519a901e391419534c31d189300bb8de Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 2 Dec 2017 15:23:32 -0500 Subject: [PATCH 002/184] [scripts] Support batchnorm after LSTM layers. 
--- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 84 ++++++++++++++----- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 4 + 2 files changed, 66 insertions(+), 22 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 9743d0100b9..96f63537a55 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -103,7 +103,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -113,7 +113,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name @@ -258,6 +258,8 @@ def generate_lstm_config(self): # This class is for lines like # 'lstmp-layer name=lstm1 input=[-1] delay=-3' +# (you can also use the name 'lstmp-batchnorm-layer' if you want it to be followed +# by batchnorm). # It generates an LSTM sub-graph with output projections. It can also generate # outputs without projection, but you could use the XconfigLstmLayer for this # simple LSTM. @@ -292,7 +294,9 @@ def generate_lstm_config(self): # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigLstmpLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "lstmp-layer" + # lstmp-batchnorm-layer is like lstmp-layer but followed by a batchnorm + # component. 
+ assert first_token in ["lstmp-layer", "lstmp-batchnorm-layer"] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): @@ -353,7 +357,8 @@ def auxiliary_outputs(self): return ['c_t'] def output_name(self, auxiliary_output = None): - node_name = 'rp_t' + node_name = ( 'rp_t_batchnorm' if self.layer_type == 'lstmp-batchnorm-layer' + else 'rp_t' ) if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): node_name = auxiliary_output @@ -375,7 +380,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -385,7 +390,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name @@ -542,18 +547,27 @@ def generate_lstm_config(self): configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}" "".format(name, rec_proj_dim, bptrunc_str)) - configs.append("# r_t and p_t : rp_t will be the output") + configs.append("# r_t and p_t : rp_t will be the output (if we're not doing batchnorm)") configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t" "".format(name)) configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 " "dim={1}".format(name, rec_proj_dim)) configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + if self.layer_type == "lstmp-batchnorm-layer": + # Add the batchnorm component, if requested to include batchnorm. 
+ configs.append("component name={0}.rp_t_batchnorm type=BatchNormComponent dim={1} ".format( + name, rec_proj_dim + nonrec_proj_dim)) + configs.append("component-node name={0}.rp_t_batchnorm component={0}.rp_t_batchnorm " + "input={0}.rp_t".format(name)) + return configs # This class is for lines like # 'fast-lstm-layer name=lstm1 input=[-1] delay=-3' +# (you can also use the name 'fast-lstm-batchnorm-layer' if you want it to be followed +# by batchnorm). # It generates an LSTM sub-graph without output projections. # Unlike 'lstm-layer', the core nonlinearities of the LSTM are done in a special-purpose # component (LstmNonlinearityComponent), and most of the affine parts of the LSTM are combined @@ -586,7 +600,7 @@ def generate_lstm_config(self): # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigFastLstmLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "fast-lstm-layer" + assert first_token in ["fast-lstm-layer", "fast-lstm-batchnorm-layer"] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): @@ -626,7 +640,8 @@ def auxiliary_outputs(self): return ['c'] def output_name(self, auxiliary_output = None): - node_name = 'm' + node_name = ('m_batchnorm' if self.layer_type == 'fast-lstm-batchnorm-layer' + else 'm') if auxiliary_output is not None: if auxiliary_output == 'c': node_name = 'c' @@ -647,7 +662,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -657,7 +672,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name @@ -723,7 +738,13 @@ def 
generate_lstm_config(self): configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim)) - # configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) + + if self.layer_type == "fast-lstm-batchnorm-layer": + # Add the batchnorm component, if requested to include batchnorm. + configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( + name, cell_dim)) + configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " + "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) return configs @@ -731,6 +752,8 @@ def generate_lstm_config(self): # This class is for lines like # 'fast-lstmb-layer name=lstm1 input=[-1] delay=-3' +# (you can also call it 'fast-lstmb-batchnorm-layer' if you want it to end +# in a batchnorm component). # It's like fast-lstm-layer but with a bottleneck (like an SVD) in the main parameter matrix # of the LSTM (W_all, which combines all the full-rank projections of the LSTM): we divide # it into two matrices, with batch-norm in between to stabilize the training. 
@@ -763,7 +786,7 @@ def generate_lstm_config(self): # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigFastLstmbLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "fast-lstmb-layer" + assert first_token in [ 'fast-lstmb-layer', 'fast-lstmb-batchnorm-layer' ] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): @@ -807,7 +830,8 @@ def auxiliary_outputs(self): return ['c'] def output_name(self, auxiliary_output = None): - node_name = 'm' + node_name = ('m_batchnorm' if self.layer_type == 'fast-lstmb-batchnorm-layer' + else 'm') if auxiliary_output is not None: if auxiliary_output == 'c': node_name = 'c' @@ -828,7 +852,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -838,7 +862,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name @@ -923,6 +947,13 @@ def generate_lstm_config(self): configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim)) # configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) + + if self.layer_type == "fast-lstmb-batchnorm-layer": + # Add the batchnorm component, if requested to include batchnorm. 
+ configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( + name, cell_dim)) + configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " + "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) return configs @@ -933,6 +964,8 @@ def generate_lstm_config(self): # 'fast-lstmp-layer name=lstm1 input=[-1] delay=-3' # or: # 'fast-lstmp-layer name=lstm1 input=[-1] delay=-3 cell-dim=1024 recurrent-projection-dim=512 non-recurrent-projection-dim=512' +# (you can also use the name 'fast-lstmp-batchnorm-layer' if you want it to be followed +# by batchnorm). # It generates an LSTM sub-graph with output projections (i.e. a projected LSTM, AKA LSTMP). # Unlike 'lstmp-layer', the core nonlinearities of the LSTM are done in a special-purpose # component (LstmNonlinearityComponent), and most of the affine parts of the LSTM are combined @@ -968,7 +1001,7 @@ def generate_lstm_config(self): # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigFastLstmpLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "fast-lstmp-layer" + assert first_token in ['fast-lstmp-layer', 'fast-lstmp-batchnorm-layer'] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): @@ -1026,7 +1059,8 @@ def auxiliary_outputs(self): return ['c_t'] def output_name(self, auxiliary_output = None): - node_name = 'rp' + node_name = ('rp_batchnorm' if self.layer_type == 'fast-lstmp-batchnorm-layer' + else 'rp') if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): node_name = auxiliary_output @@ -1048,7 +1082,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -1058,8 +1092,7 @@ def 
get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): - + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness @@ -1145,7 +1178,8 @@ def generate_lstm_config(self): "dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin " "dim-offset={1} dim={1}".format(name, cell_dim)) - configs.append("# {0}.rp is the output node of this layer:".format(name)) + configs.append("# {0}.rp is the output node of this layer (if we're not " + "including batchnorm)".format(name)) configs.append("component-node name={0}.rp component={0}.W_rp input={0}.m".format(name)) configs.append("dim-range-node name={0}.r input-node={0}.rp dim-offset=0 " "dim={1}".format(name, rec_proj_dim)) @@ -1158,6 +1192,12 @@ def generate_lstm_config(self): "dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) + if self.layer_type == "fast-lstmp-batchnorm-layer": + # Add the batchnorm component, if requested to include batchnorm. 
+ configs.append("component name={0}.rp_batchnorm type=BatchNormComponent dim={1} ".format( + name, rec_proj_dim + nonrec_proj_dim)) + configs.append("component-node name={0}.rp_batchnorm component={0}.rp_batchnorm " + "input={0}.rp".format(name)) configs.append("### End LSTM Layer '{0}'".format(name)) return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 99424cd535e..c41b1092da1 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -34,9 +34,13 @@ 'affine-layer' : xlayers.XconfigAffineLayer, 'lstm-layer' : xlayers.XconfigLstmLayer, 'lstmp-layer' : xlayers.XconfigLstmpLayer, + 'lstmp-batchnorm-layer' : xlayers.XconfigLstmpLayer, 'fast-lstm-layer' : xlayers.XconfigFastLstmLayer, + 'fast-lstm-batchnorm-layer' : xlayers.XconfigFastLstmLayer, 'fast-lstmp-layer' : xlayers.XconfigFastLstmpLayer, + 'fast-lstmp-batchnorm-layer' : xlayers.XconfigFastLstmpLayer, 'fast-lstmb-layer' : xlayers.XconfigFastLstmbLayer, + 'fast-lstmb-batchnorm-layer' : xlayers.XconfigFastLstmbLayer, 'stats-layer': xlayers.XconfigStatsLayer, 'relu-conv-layer': xlayers.XconfigConvLayer, 'conv-layer': xlayers.XconfigConvLayer, From 1a1bc73ebd5713fba3848d9e2b831e693262acd3 Mon Sep 17 00:00:00 2001 From: freewym Date: Fri, 8 Dec 2017 23:31:52 -0500 Subject: [PATCH 003/184] model combine by averaging --- src/chainbin/nnet3-chain-combine.cc | 90 ++++++++++++++++++++++----- src/nnet3bin/nnet3-combine.cc | 95 +++++++++++++++++++++++------ 2 files changed, 153 insertions(+), 32 deletions(-) diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index 3c44e6b904c..9100fbe3132 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -1,6 +1,7 @@ // chainbin/nnet3-chain-combine.cc // Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2017 Yiming Wang // See ../../COPYING for clarification 
regarding multiple authors // @@ -19,7 +20,56 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "nnet3/nnet-chain-combine.h" +#include "nnet3/nnet-utils.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-chain-diagnostics.h" + + +namespace kaldi { +namespace nnet3 { + +double ComputeObjf(const std::vector &egs, + NnetChainComputeProb *prob_computer) { + prob_computer->Reset(); + std::vector::const_iterator iter = egs.begin(), + end = egs.end(); + for (; iter != end; ++iter) + prob_computer->Compute(*iter); + const ChainObjectiveInfo *objf_info = + prob_computer->GetObjective("output"); + if (objf_info == NULL) + KALDI_ERR << "Error getting objective info (unsuitable egs?)"; + KALDI_ASSERT(objf_info->tot_weight > 0.0); + // we prefer to deal with normalized objective functions. + return (objf_info->tot_like + objf_info->tot_l2_term) / objf_info->tot_weight; +} + +// Note: the object that prob_computer.nnet_ refers to should be +// *moving_average_nnet. +double UpdateNnetMovingAverageAndComputeObjf(int32 num_models, + const std::vector &egs, + const Nnet &nnet, Nnet *moving_average_nnet, + NnetChainComputeProb *prob_computer) { + int32 num_params = NumParameters(nnet); + KALDI_ASSERT(num_params == NumParameters(*moving_average_nnet)); + Vector nnet_params(num_params, kUndefined), + moving_average_nnet_params(num_params, kUndefined); + VectorizeNnet(nnet, &nnet_params); + VectorizeNnet(*moving_average_nnet, &moving_average_nnet_params); + moving_average_nnet_params.Scale((num_models - 1.0) / num_models); + moving_average_nnet_params.AddVec(1.0 / num_models, nnet_params); + + BaseFloat sum = moving_average_nnet_params.Sum(); + // inf/nan parameters->return -inf objective. 
+ if (!(sum == sum && sum - sum == 0)) + return -std::numeric_limits::infinity(); + + UnVectorizeNnet(moving_average_nnet_params, moving_average_nnet); + return ComputeObjf(egs, prob_computer); +} + +} +} int main(int argc, char *argv[]) { @@ -30,9 +80,11 @@ int main(int argc, char *argv[]) { typedef kaldi::int64 int64; const char *usage = - "Using a subset of training or held-out nnet3+chain examples, compute an\n" - "optimal combination of anumber of nnet3 neural nets by maximizing the\n" - "'chain' objective function. See documentation of options for more details.\n" + "Using a subset of training or held-out nnet3+chain examples, compute\n" + "the average over the first n nnet models where we maximize the\n" + "'chain' objective function for n. Note that the order of models has\n" + "been reversed before feeding into this binary. So we are actually\n" + "combining last n models.\n" "Inputs and outputs are nnet3 raw nnets.\n" "\n" "Usage: nnet3-chain-combine [options] ... \n" @@ -44,7 +96,6 @@ int main(int argc, char *argv[]) { bool batchnorm_test_mode = false, dropout_test_mode = true; std::string use_gpu = "yes"; - NnetCombineConfig combine_config; chain::ChainTrainingOptions chain_config; ParseOptions po(usage); @@ -57,7 +108,6 @@ int main(int argc, char *argv[]) { "If true, set test-mode to true on any DropoutComponents and " "DropoutMaskComponents."); - combine_config.Register(&po); chain_config.Register(&po); po.Read(argc, argv); @@ -83,6 +133,10 @@ int main(int argc, char *argv[]) { Nnet nnet; ReadKaldiObject(raw_nnet_rxfilename, &nnet); + Nnet moving_average_nnet(nnet), best_nnet(nnet); + NnetComputeProbOptions compute_prob_opts; + NnetChainComputeProb *prob_computer = new NnetChainComputeProb( + compute_prob_opts, chain_config, den_fst, moving_average_nnet); if (batchnorm_test_mode) SetBatchnormTestMode(true, &nnet); @@ -102,28 +156,36 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(!egs.empty()); } + int32 best_n = 1; + double best_objf = 
ComputeObjf(egs, prob_computer); + KALDI_LOG << "objective function using the last model is " << best_objf; int32 num_nnets = po.NumArgs() - 3; - NnetChainCombiner combiner(combine_config, chain_config, - num_nnets, egs, den_fst, nnet); for (int32 n = 1; n < num_nnets; n++) { std::string this_nnet_rxfilename = po.GetArg(n + 2); ReadKaldiObject(this_nnet_rxfilename, &nnet); - combiner.AcceptNnet(nnet); + double objf = UpdateNnetMovingAverageAndComputeObjf(n + 1, egs, nnet, + &moving_average_nnet, prob_computer); + KALDI_LOG << "Combining last " << n + 1 + << " models, objective function is " << objf; + if (objf > best_objf) { + best_objf = objf; + best_nnet = moving_average_nnet; + best_n = n + 1; + } } - combiner.Combine(); - - nnet = combiner.GetNnet(); if (HasBatchnorm(nnet)) - RecomputeStats(egs, chain_config, den_fst, &nnet); + RecomputeStats(egs, chain_config, den_fst, &best_nnet); #if HAVE_CUDA==1 CuDevice::Instantiate().PrintProfile(); #endif - WriteKaldiObject(nnet, nnet_wxfilename, binary_write); + WriteKaldiObject(best_nnet, nnet_wxfilename, binary_write); + KALDI_LOG << "Using the model averaged over last " << best_n + << " models, objective function is " << best_objf; KALDI_LOG << "Finished combining neural nets, wrote model to " << nnet_wxfilename; diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index 128a9642ec4..e4181d5133b 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ b/src/nnet3bin/nnet3-combine.cc @@ -1,6 +1,7 @@ // nnet3bin/nnet3-combine.cc // Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2017 Yiming Wang // See ../../COPYING for clarification regarding multiple authors // @@ -19,8 +20,54 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "nnet3/nnet-combine.h" +#include "nnet3/nnet-utils.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-diagnostics.h" + + +namespace kaldi { +namespace nnet3 { + +double ComputeObjf(const std::vector &egs, + NnetComputeProb 
*prob_computer) { + prob_computer->Reset(); + std::vector::const_iterator iter = egs.begin(), + end = egs.end(); + for (; iter != end; ++iter) + prob_computer->Compute(*iter); + double tot_weights, + tot_objf = prob_computer->GetTotalObjective(&tot_weights); + KALDI_ASSERT(tot_weights > 0.0); + // we prefer to deal with normalized objective functions. + return tot_objf / tot_weights; +} + +// Note: the object that prob_computer.nnet_ refers to should be +// *moving_average_nnet. +double UpdateNnetMovingAverageAndComputeObjf(int32 num_models, + const std::vector &egs, + const Nnet &nnet, Nnet *moving_average_nnet, + NnetComputeProb *prob_computer) { + int32 num_params = NumParameters(nnet); + KALDI_ASSERT(num_params == NumParameters(*moving_average_nnet)); + Vector nnet_params(num_params, kUndefined), + moving_average_nnet_params(num_params, kUndefined); + VectorizeNnet(nnet, &nnet_params); + VectorizeNnet(*moving_average_nnet, &moving_average_nnet_params); + moving_average_nnet_params.Scale((num_models - 1.0) / num_models); + moving_average_nnet_params.AddVec(1.0 / num_models, nnet_params); + + BaseFloat sum = moving_average_nnet_params.Sum(); + // inf/nan parameters->return -inf objective. + if (!(sum == sum && sum - sum == 0)) + return -std::numeric_limits::infinity(); + + UnVectorizeNnet(moving_average_nnet_params, moving_average_nnet); + return ComputeObjf(egs, prob_computer); +} +} +} int main(int argc, char *argv[]) { try { @@ -30,11 +77,13 @@ int main(int argc, char *argv[]) { typedef kaldi::int64 int64; const char *usage = - "Using a subset of training or held-out examples, compute an optimal combination of a\n" - "number of nnet3 neural nets by maximizing the objective function. See documentation of\n" - "options for more details. Inputs and outputs are 'raw' nnets.\n" + "Using a subset of training or held-out examples, compute the average\n" + "over the first n nnet3 models where we maxize the objective function\n" + "for n. 
Note that the order of models has been reversed before\n" + "feeding into this binary. So we are actually combining last n models.\n" + "Inputs and outputs are 'raw' nnets.\n" "\n" - "Usage: nnet3-combine [options] ... \n" + "Usage: nnet3-combine [options] ... \n" "\n" "e.g.:\n" " nnet3-combine 1.1.raw 1.2.raw 1.3.raw ark:valid.egs 2.raw\n"; @@ -43,7 +92,6 @@ int main(int argc, char *argv[]) { bool batchnorm_test_mode = false, dropout_test_mode = true; std::string use_gpu = "yes"; - NnetCombineConfig combine_config; ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); @@ -55,8 +103,6 @@ int main(int argc, char *argv[]) { po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); - combine_config.Register(&po); - po.Read(argc, argv); if (po.NumArgs() < 3) { @@ -75,6 +121,10 @@ int main(int argc, char *argv[]) { Nnet nnet; ReadKaldiObject(nnet_rxfilename, &nnet); + Nnet moving_average_nnet(nnet), best_nnet(nnet); + NnetComputeProbOptions compute_prob_opts; + NnetComputeProb *prob_computer = new NnetComputeProb(compute_prob_opts, + moving_average_nnet); if (batchnorm_test_mode) SetBatchnormTestMode(true, &nnet); @@ -94,24 +144,33 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(!egs.empty()); } + int32 best_n = 1; + double best_objf = ComputeObjf(egs, prob_computer); + KALDI_LOG << "objective function using the last model is " << best_objf; - int32 num_nnets = po.NumArgs() - 2; - if (num_nnets > 1 || !combine_config.enforce_sum_to_one) { - NnetCombiner combiner(combine_config, num_nnets, egs, nnet); - - for (int32 n = 1; n < num_nnets; n++) { + int32 num_inputs = po.NumArgs() - 2; + if (num_inputs > 1) { + for (int32 n = 1; n < num_inputs; n++) { ReadKaldiObject(po.GetArg(1 + n), &nnet); - combiner.AcceptNnet(nnet); + double objf = UpdateNnetMovingAverageAndComputeObjf(n + 1, egs, nnet, + &moving_average_nnet, prob_computer); + KALDI_LOG << "Combining last " << n + 1 + << " models, 
objective function is " << objf; + if (objf > best_objf) { + best_objf = objf; + best_nnet = moving_average_nnet; + best_n = n + 1; + } } - combiner.Combine(); #if HAVE_CUDA==1 CuDevice::Instantiate().PrintProfile(); #endif - nnet = combiner.GetNnet(); if (HasBatchnorm(nnet)) - RecomputeStats(egs, &nnet); - WriteKaldiObject(nnet, nnet_wxfilename, binary_write); + RecomputeStats(egs, &best_nnet); + WriteKaldiObject(best_nnet, nnet_wxfilename, binary_write); + KALDI_LOG << "Using the model averaged over last " << best_n + << " models, objective function is " << best_objf; } else { KALDI_LOG << "Copying the single input model directly to the output, " << "without any combination."; From f41b0a2f3a6de583a92becd6e790364f0836edaa Mon Sep 17 00:00:00 2001 From: freewym Date: Sun, 10 Dec 2017 15:38:23 -0500 Subject: [PATCH 004/184] fix --- src/chainbin/nnet3-chain-combine.cc | 5 ++--- src/nnet3bin/nnet3-combine.cc | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index 9100fbe3132..7dece5cb070 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -175,6 +175,8 @@ int main(int argc, char *argv[]) { best_n = n + 1; } } + KALDI_LOG << "Using the model averaged over last " << best_n + << " models, objective function is " << best_objf; if (HasBatchnorm(nnet)) RecomputeStats(egs, chain_config, den_fst, &best_nnet); @@ -184,9 +186,6 @@ int main(int argc, char *argv[]) { #endif WriteKaldiObject(best_nnet, nnet_wxfilename, binary_write); - KALDI_LOG << "Using the model averaged over last " << best_n - << " models, objective function is " << best_objf; - KALDI_LOG << "Finished combining neural nets, wrote model to " << nnet_wxfilename; } catch(const std::exception &e) { diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index e4181d5133b..5d67715a228 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ 
b/src/nnet3bin/nnet3-combine.cc @@ -162,6 +162,8 @@ int main(int argc, char *argv[]) { best_n = n + 1; } } + KALDI_LOG << "Using the model averaged over last " << best_n + << " models, objective function is " << best_objf; #if HAVE_CUDA==1 CuDevice::Instantiate().PrintProfile(); @@ -169,8 +171,6 @@ int main(int argc, char *argv[]) { if (HasBatchnorm(nnet)) RecomputeStats(egs, &best_nnet); WriteKaldiObject(best_nnet, nnet_wxfilename, binary_write); - KALDI_LOG << "Using the model averaged over last " << best_n - << " models, objective function is " << best_objf; } else { KALDI_LOG << "Copying the single input model directly to the output, " << "without any combination."; From 441c5df446de7a5944dbdeb0bf9c09fe30568e88 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 6 Dec 2017 17:12:30 -0500 Subject: [PATCH 005/184] [src] Nnet3 changes: add MemoryNormComponent; move BatchNorm,Normalize component code. --- src/nnet3/Makefile | 2 +- src/nnet3/nnet-component-itf.cc | 3 + src/nnet3/nnet-component-itf.h | 15 +- src/nnet3/nnet-normalize-component.cc | 1217 +++++++++++++++++++++++++ src/nnet3/nnet-normalize-component.h | 541 +++++++++++ src/nnet3/nnet-parse.cc | 7 +- src/nnet3/nnet-parse.h | 4 +- src/nnet3/nnet-simple-component.cc | 656 ------------- src/nnet3/nnet-simple-component.h | 256 +----- src/nnet3/nnet-test-utils.cc | 10 + src/nnet3/nnet-utils.cc | 1 + 11 files changed, 1794 insertions(+), 918 deletions(-) create mode 100644 src/nnet3/nnet-normalize-component.cc create mode 100644 src/nnet3/nnet-normalize-component.h diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index 3236c52d60f..51dade98831 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -15,7 +15,7 @@ TESTFILES = natural-gradient-online-test nnet-graph-test \ nnet-common-test convolution-test attention-test OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \ - nnet-simple-component.o \ + nnet-simple-component.o nnet-normalize-component.o \ nnet-general-component.o nnet-parse.o 
natural-gradient-online.o \ nnet-descriptor.o nnet-optimize.o nnet-computation.o \ nnet-computation-graph.o nnet-graph.o am-nnet-simple.o \ diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 82010fea58d..f83ad26f375 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -23,6 +23,7 @@ #include #include "nnet3/nnet-component-itf.h" #include "nnet3/nnet-simple-component.h" +#include "nnet3/nnet-normalize-component.h" #include "nnet3/nnet-general-component.h" #include "nnet3/nnet-convolutional-component.h" #include "nnet3/nnet-attention-component.h" @@ -163,6 +164,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new LstmNonlinearityComponent(); } else if (component_type == "BatchNormComponent") { ans = new BatchNormComponent(); + } else if (component_type == "MemoryNormComponent") { + ans = new MemoryNormComponent(); } else if (component_type == "TimeHeightConvolutionComponent") { ans = new TimeHeightConvolutionComponent(); } else if (component_type == "RestrictedAttentionComponent") { diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index 62e09cee80f..565a7f25e74 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -351,20 +351,23 @@ class Component { /// although most components will have much more info. virtual std::string Info() const; - /// This virtual function when called by - // -- an UpdatableComponent scales the parameters + /// This virtual function when called on + /// -- an UpdatableComponent scales the parameters /// by "scale" when called by an UpdatableComponent. - // -- a Nonlinear component (or another component that - /// stores stats, like BatchNormComponent-- it relates + /// -- a Nonlinear component (or another component that + /// stores stats, like BatchNormComponent)-- it relates /// to scaling activation stats, not parameters. + /// Otherwise it will normally do nothing. 
virtual void Scale(BaseFloat scale) {}; /// This virtual function when called by /// -- an UpdatableComponent adds the parameters of /// another updatable component, times some constant, to the current /// parameters. - /// -- a NonlinearComponent it relates to adding stats - /// Otherwise it should do nothing. + /// -- a NonlinearComponent (or another component that stores + /// stats, like BatchNormComponent)-- it relates to adding + /// stats. + /// Otherwise it will normally do nothing. virtual void Add(BaseFloat alpha, const Component &other) {}; /// This virtual function only needs to be overwritten by Components that diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc new file mode 100644 index 00000000000..ac3817adfbe --- /dev/null +++ b/src/nnet3/nnet-normalize-component.cc @@ -0,0 +1,1217 @@ +// nnet3/nnet-normalize-component.cc + +// Copyright 2015-2017 Johns Hopkins University (author: Daniel Povey) +// 2015 Guoguo Chen +// 2015 Daniel Galvez + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include "nnet3/nnet-normalize-component.h" +#include "nnet3/nnet-parse.h" +#include "cudamatrix/cu-math.h" + +namespace kaldi { +namespace nnet3 { + +const BaseFloat NormalizeComponent::kSquaredNormFloor = + pow(2.0, NormalizeComponent::kExpSquaredNormFloor); + +NormalizeComponent::NormalizeComponent(const NormalizeComponent &other): + input_dim_(other.input_dim_), block_dim_(other.block_dim_), + target_rms_(other.target_rms_), + add_log_stddev_(other.add_log_stddev_) { } + +void NormalizeComponent::InitFromConfig(ConfigLine *cfl) { + input_dim_ = 0; + add_log_stddev_ = false; + target_rms_ = 1.0; + bool ok = cfl->GetValue("dim", &input_dim_) || + cfl->GetValue("input-dim", &input_dim_); + block_dim_ = input_dim_; + cfl->GetValue("block-dim", &block_dim_); + cfl->GetValue("target-rms", &target_rms_); + cfl->GetValue("add-log-stddev", &add_log_stddev_); + if (!ok || cfl->HasUnusedValues() || input_dim_ <= 0 || target_rms_ <= 0.0 || + block_dim_ <= 0 || input_dim_ % block_dim_ != 0) + KALDI_ERR << "Invalid initializer for layer of type " + << Type() << ": \"" << cfl->WholeLine() << "\""; +} + +void NormalizeComponent::Read(std::istream &is, bool binary) { + std::string token; + ReadToken(is, binary, &token); + if (token == "") { + ReadToken(is, binary, &token); + } + KALDI_ASSERT(token == "" || token == ""); + ReadBasicType(is, binary, &input_dim_); // Read dimension. + ReadToken(is, binary, &token); + if (token == "") { + ReadBasicType(is, binary, &block_dim_); + ReadToken(is, binary, &token); + } else { + block_dim_ = input_dim_; + } + // read target_rms_ if it is available. + if (token == "") { + ReadBasicType(is, binary, &target_rms_); + ReadToken(is, binary, &token); + } + // Read add_log_stddev_ token, if it is available. + if (token == "") { + ReadBasicType(is, binary, &add_log_stddev_); + ReadToken(is, binary, &token); + } + if (token == "") { + // back-compatibility code. 
+ CuVector temp; + temp.Read(is, binary); + ExpectToken(is, binary, ""); + temp.Read(is, binary); + ExpectToken(is, binary, ""); + double count; + ReadBasicType(is, binary, &count); + ReadToken(is, binary, &token); + } + KALDI_ASSERT(token == ""); +} + +void NormalizeComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_dim_); + if (block_dim_ != input_dim_) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, block_dim_); + } + WriteToken(os, binary, ""); + WriteBasicType(os, binary, target_rms_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, add_log_stddev_); + WriteToken(os, binary, ""); +} + +std::string NormalizeComponent::Info() const { + std::ostringstream stream; + stream << Type() << ", input-dim=" << InputDim() + << ", output-dim=" << OutputDim() << ", target-rms=" << target_rms_ + << ", add-log-stddev=" << std::boolalpha << add_log_stddev_; + if (block_dim_ != input_dim_) + stream << ", block-dim=" << block_dim_; + return stream.str(); +} + +// The output y_i = scale * x_i, +// and we want to RMS value of the y_i to equal target_rms, +// so y^t y = D * target_rms^2 (if y is one row of the input). +// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). +// there is also flooring involved, to avoid division-by-zero +// problems. It's important for the backprop, that the floor's +// square root is exactly representable as float. +// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D))) +// is an extra dimension of the output. 
+void* NormalizeComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(in.NumCols() == InputDim() && out->NumCols() == OutputDim() && + in.NumRows() == out->NumRows()); + if (block_dim_ != input_dim_) { + int32 num_blocks = input_dim_ / block_dim_, + new_num_rows = in.NumRows() * num_blocks, + output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0); + KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); + CuSubMatrix in_reshaped(in.Data(), new_num_rows, + block_dim_, block_dim_), + out_reshaped(out->Data(), new_num_rows, + output_block_dim, output_block_dim); + cu::NormalizePerRow(in_reshaped, target_rms_, add_log_stddev_, + &out_reshaped); + } else { + cu::NormalizePerRow(in, target_rms_, add_log_stddev_, out); + } + return NULL; +} + +/* + A note on the derivative of NormalizeComponent... + let both row_in and row_out be vectors of dimension D. + Let p = row_in^T row_in / (D * target_rms^2), and let + f = 1.0 / sqrt(max(kSquaredNormFloor, p)), and we compute row_out as: + row_out = f row_in. + Suppose we have a quantity deriv_out which is the derivative + of the objective function w.r.t. row_out. We want to compute + deriv_in which is the derivative of the objective function w.r.t. + row_in. Let the objective function be F. One term is obvious: we have + deriv_in = f deriv_out + .... + next we have to take into account the derivative that gets back-propagated + through f. Obviously, dF/df = deriv_out^T row_in. + And df/dp = (p <= kSquaredNormFloor ? 0.0 : -0.5 p^{-1.5}) = (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3), + and dp/d(row_in) = 2/(D * target_rms^2) row_in. [it's vector_valued]. + So this term in dF/d(row_in) equals: + dF/df df/dp dp/d(row_in) = 2/(D * target_rms^2) (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3) (deriv_out^T row_in) row_in + So + deriv_in = f deriv_out + (f == 1.0 ? 
0.0 : -f^3 / (D * target_rms^2) ) (deriv_out^T row_in) row_in + + if add_log_stddev_ true, the deriv_in has another term as + dF/dx_i = dF/df . df/dx_i => df/dx_i = x_i/(x^T x) +*/ +void NormalizeComponent::Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const { + if (!in_deriv) + return; + if (block_dim_ != input_dim_) { + int32 num_blocks = input_dim_ / block_dim_, + new_num_rows = in_value.NumRows() * num_blocks, + output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0); + KALDI_ASSERT(in_value.Stride() == in_value.NumCols() && + out_deriv.Stride() == out_deriv.NumCols() && + in_deriv->Stride() == in_deriv->NumCols()); + CuSubMatrix in_value_reshaped(in_value.Data(), new_num_rows, + block_dim_, block_dim_), + out_deriv_reshaped(out_deriv.Data(), new_num_rows, + output_block_dim, output_block_dim), + in_deriv_reshaped(in_deriv->Data(), new_num_rows, + block_dim_, block_dim_); + cu::DiffNormalizePerRow(in_value_reshaped, out_deriv_reshaped, target_rms_, + add_log_stddev_, &in_deriv_reshaped); + } else { + cu::DiffNormalizePerRow(in_value, out_deriv, target_rms_, add_log_stddev_, + in_deriv); + } +} + +void BatchNormComponent::ComputeDerived() { + if (!test_mode_) { + offset_.Resize(0); + scale_.Resize(0); + return; + } + + if (count_ == 0.0) { + KALDI_WARN << "Test-mode is set but there is no data count. " + "Creating random counts. This only makes sense " + "in unit-tests (or compute_prob_*.0.log). If you see this " + "elsewhere, something is very wrong."; + count_ = 1.0; + stats_sum_.SetRandn(); + stats_sumsq_.SetRandn(); + stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0); + } + + offset_.Resize(block_dim_); + scale_.Resize(block_dim_); + offset_.CopyFromVec(stats_sum_); + offset_.Scale(-1.0 / count_); + // now offset_ is -mean. 
+ scale_.CopyFromVec(stats_sumsq_); + scale_.Scale(1.0 / count_); + scale_.AddVecVec(-1.0, offset_, offset_, 1.0); + // now scale_ is variance. + // Mathematically the ApplyFloor statement should be a no-op; this is in case + // of numerical roundoff. + scale_.ApplyFloor(0.0); + scale_.Add(epsilon_); + scale_.ApplyPow(-0.5); + // now scale_ = min(variance, epsilon)^{-0.5}. + // next, multiply by the target RMS (normally 1.0). + scale_.Scale(target_rms_); + offset_.MulElements(scale_); + // now offset_ is -(scale*mean). +} + +void BatchNormComponent::SetTestMode(bool test_mode) { + test_mode_ = test_mode; + ComputeDerived(); +} + +void BatchNormComponent::Check() const { + KALDI_ASSERT(dim_ > 0 && block_dim_ > 0 && dim_ % block_dim_ == 0 && + epsilon_ > 0.0 && target_rms_ > 0.0); +} + +BatchNormComponent::BatchNormComponent(const BatchNormComponent &other): + dim_(other.dim_), block_dim_(other.block_dim_), epsilon_(other.epsilon_), + target_rms_(other.target_rms_), test_mode_(other.test_mode_), + count_(other.count_), stats_sum_(other.stats_sum_), + stats_sumsq_(other.stats_sumsq_) { + ComputeDerived(); + Check(); +} + + +std::string BatchNormComponent::Info() const { + std::ostringstream stream; + stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ + << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ + << ", count=" << count_ + << ", test-mode=" << (test_mode_ ? "true" : "false"); + if (count_ > 0) { + Vector mean(stats_sum_), var(stats_sumsq_); + mean.Scale(1.0 / count_); + var.Scale(1.0 / count_); + // subtract mean^2 from var. + var.AddVecVec(-1.0, mean, mean, 1.0); + var.ApplyFloor(0.0); + var.ApplyPow(0.5); // make it the stddev. 
+ stream << ", data-mean=" << SummarizeVector(mean) + << ", data-stddev=" << SummarizeVector(var); + } + return stream.str(); +} + +void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { + dim_ = -1; + block_dim_ = -1; + epsilon_ = 1.0e-03; + target_rms_ = 1.0; + test_mode_ = false; + bool ok = cfl->GetValue("dim", &dim_); + cfl->GetValue("block-dim", &block_dim_); + cfl->GetValue("epsilon", &epsilon_); + cfl->GetValue("target-rms", &target_rms_); + cfl->GetValue("test-mode", &test_mode_); + if (!ok || dim_ <= 0) { + KALDI_ERR << "BatchNormComponent must have 'dim' specified, and > 0"; + } + if (block_dim_ == -1) + block_dim_ = dim_; + if (!(block_dim_ > 0 && dim_ % block_dim_ == 0 && + epsilon_ > 0 && target_rms_ > 0)) + KALDI_ERR << "Invalid configuration in BatchNormComponent."; + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + count_ = 0; + stats_sum_.Resize(block_dim_); + stats_sumsq_.Resize(block_dim_); + if (test_mode_) { + ComputeDerived(); + } +} + + + +/* + BATCH_NORM_MATH + + This comment describes the equations involved in batch normalization, and + derives the forward and back-propagation. + + This is all dimension-by-dimension, so we just imagine the inputs + are scalars x(i), for i=0 .. n-1. + + FORWARD PASS: + + Define xsum = sum_i x(i) + x2sum = sum_i x(i)^2 + mean = xsum / n + var = x2sum / n - (mean*mean) + scale = sqrt(var + epsilon)^{-0.5} + offset = -mean * scale + + y(i) = scale * x(i) + offset + + Most of the rest of this comment derives how to compute the derivatives. If + you just want the formulas, please skip to the string 'BACKWARD PASS' below. + + We'll use a notation where an apostrophe on something means (the derivative of + the objective function w.r.t. that thing), so y'(i) is df/dy(i), and so on. + We are given y'(i). 
Propagating the derivatives backward: + offset' = sum_i y'(i) + scale' = (sum_i y'(i) * x(i)) - offset' * mean + var' = scale' * -0.5 * sqrt(var + epsilon)^{-1.5} + = -0.5 * scale' * scale^3 + mean' = -offset' * scale - 2 * mean * var' + xsum' = mean' / n + x2sum' = var' / n + + So the derivatives propagated back to the original data are: + x'(i) = y'(i) * scale + xsum' + x(i) * x2sum' + + The above is quite complicated to compute, but we can use some invariances + to work out a simpler way to compute the derivatives. + + Firstly, note that x'(i) is of the form: + + x'(i) = y'(i) * scale + [affine function of x(i)]. + + [it's a 1-d affine function, i.e. offset and scale]. + This has the same functional form as: + + x'(i) = y'(i) * scale + [affine function of y(i)]. + + since y(i) is an affine function of x(i) with nonzero scale. + Because the output is invariant to shifts in the input, sum_i x'(i) + will be zero. This is sufficient to determine the bias + term in the affine function. [Note: the scale on y(i) doesn't + come into it because the y(i) sum to zero]. The offset + will just be (sum_i y'(i) * scale / n); this makes the sum of x'(i) zero. + So let's write it as + + x'(i) = (y'(i) - 1/n sum_i y'(i)) * scale + alpha y(i). + + and it will be convenient to define: + + x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale + + which is just y'(i) with mean subtraction, scaled according to + the scale used in the normalization. So write + + x'(i) = x_deriv_base(i) + alpha y(i). + + The question is, what is the scale alpha. We don't actually need to + do any differentiation to figure this out. First, assume there is + no "+ epsilon" in the variance; later we'll explain why this doesn't + matter. The key to working out alpha is that the output is invariant + to scaling of the input. Assume we scale around the input's mean, + since that makes the math simpler. We can express this by the + constraint that (\sum_i x'(i) * (x(i) - avg-x)) = 0. 
This is + equivalent to the constraint that (\sum_i x'(i) y (i)) = 0, since + y(i) is x(i) - avg-x times a nonzero scale. We'll use this contraint + to determine alpha, Using the above expressionfor x(i), we can write + this constraint as: + \sum_i ( y(i) x_deriv_base(i) + alpha y(i) y(i)) = 0. + Now, since we said we'd ignore the epsilon, the output has unit variance, + so we know that \sum_i y(i) y(i) = n. + So alpha = - \sum_i y(i) x_deriv_base(i) / n. We can actually re-imagine + the epsilon term (or variance-flooring) as having been implemented by + adding a couple extra rows to the matrix with suitable values, and zero + output-deriv for those rows. If you think about it carefully you'll see that + the formula above is valid even if there is an extra term + in the variance. Anyway the correctness of the derivative will get tested + throughly by the component unit-tests. + + So to recap, here is the backprop. + + BACKWARD PASS: + + We are given y'(i), scale, and y(i). + + We compute: + x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale + alpha = - \sum_i y(i) x_deriv_base(i) / n + x'(i) = x_deriv_base(i) + alpha y(i) + */ + + + +void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(SameDim(in, *out) && + (in.NumCols() == dim_ || in.NumCols() == block_dim_)); + if (in.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. 
+ KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); + int32 ratio = dim_ / block_dim_, orig_rows = in.NumRows(), + orig_cols = in.NumCols(), new_rows = orig_rows * ratio, + new_cols = orig_cols / ratio; + CuSubMatrix in_reshaped(in.Data(), new_rows, new_cols, new_cols), + out_reshaped(out->Data(), new_rows, new_cols, new_cols); + return Propagate(indexes, in_reshaped, &out_reshaped); + } + + // From this point, we can assume that the num-cols of 'in' and 'out' + // equals block_dim_. + + if (!test_mode_) { + // search in the comment above for FORWARD PASS to see what is being + // implemented here. + // if this takes too much time due to multiple different CUDA calls, + // we'll consider making a single kernel for some of it. + Memo *memo = new Memo; + int32 num_frames = in.NumRows(), dim = block_dim_; + memo->num_frames = num_frames; + memo->mean_uvar_scale.Resize(4, dim); + CuSubVector mean(memo->mean_uvar_scale, 0), + uvar(memo->mean_uvar_scale, 1), + scale(memo->mean_uvar_scale, 2); + mean.AddRowSumMat(1.0 / num_frames, in, 0.0); + uvar.AddDiagMat2(1.0 / num_frames, in, kTrans, 0.0); + scale.CopyFromVec(uvar); + // by applying this scale at this point, we save a multiply later on. + BaseFloat var_scale = 1.0 / (target_rms_ * target_rms_); + scale.AddVecVec(-var_scale, mean, mean, var_scale); + // at this point, 'scale' contains just the variance [divided by target-rms^2]. + scale.ApplyFloor(0.0); + scale.Add(var_scale * epsilon_); + // Now 'scale' contains the variance floored to zero and then with epsilon + // added [both divided by target-rms^2]. + scale.ApplyPow(-0.5); + // now 'scale' is the actual scale we'll use. + + // the next command will do no work if out == in, for in-place propagation. 
+ out->CopyFromMat(in); + out->AddVecToRows(-1.0, mean, 1.0); + out->MulColsVec(scale); + return static_cast(memo); + } else { + if (offset_.Dim() != block_dim_) { + if (count_ == 0) + KALDI_ERR << "Test mode set in BatchNormComponent, but no stats."; + else // why was ComputeDerived() not called? + KALDI_ERR << "Code error in BatchNormComponent"; + } + out->CopyFromMat(in); + out->MulColsVec(scale_); + out->AddVecToRows(1.0, offset_, 1.0); + return NULL; + } +} + +void BatchNormComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, // unused + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo_in, + Component *to_update, // unused + CuMatrixBase *in_deriv) const { + + KALDI_ASSERT(SameDim(out_value, out_deriv) && + SameDim(out_value, *in_deriv) && + (out_value.NumCols() == dim_ || + out_value.NumCols() == block_dim_)); + if (out_value.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(out_value.Stride() == out_value.NumCols() && + out_deriv.Stride() == out_deriv.NumCols() && + in_deriv->Stride() == in_deriv->NumCols()); + int32 ratio = dim_ / block_dim_, + orig_rows = out_value.NumRows(), + orig_cols = out_value.NumCols(), + new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; + CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, + new_cols, new_cols), + out_deriv_reshaped(out_deriv.Data(), new_rows, new_cols, new_cols), + in_deriv_reshaped(in_deriv->Data(), new_rows, new_cols, new_cols); + // we'll never use in_value, so pass it in unchanged. + Backprop(debug_info, indexes, in_value, + out_value_reshaped, out_deriv_reshaped, + memo_in, to_update, &in_deriv_reshaped); + return; + } + + Memo *memo = static_cast(memo_in); + + if (!test_mode_) { + // search above for BACKWARD PASS for a comment describing the math. 
+ KALDI_ASSERT(memo != NULL && "memo not passed into backprop"); + int32 num_frames = memo->num_frames; + KALDI_ASSERT(out_value.NumRows() == num_frames); + CuSubVector temp(memo->mean_uvar_scale, 3), + scale(memo->mean_uvar_scale, 2); + temp.AddRowSumMat(-1.0 / num_frames, out_deriv, 0.0); + // the following does no work if in_deriv and out_deriv are the same matrix. + in_deriv->CopyFromMat(out_deriv); + in_deriv->AddVecToRows(1.0, temp); + in_deriv->MulColsVec(scale); + // at this point, 'in_deriv' contains: + // x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale + temp.AddDiagMatMat(-1.0 / (num_frames * target_rms_ * target_rms_), + out_value, kTrans, *in_deriv, kNoTrans, 0.0); + // now, 'temp' contains the quantity which we described + // in the math as: + // alpha = - \sum_i y(i) x_deriv_base(i) / n. + // The factor 1 / (target_rms_ * target_rms_) comes from following + // this additional scaling factor through the math. In the comment I said + // "we know that \sum_i y(i) y(i) = n". Taking target-rms into account + // this becomes "we know that \sum_i y(i) y(i) = n * target-rms^2". + in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans, temp, 1.0); + // At this point, in_deriv contains x'(i) = x_deriv_base(i) + alpha y(i). + + } else { + KALDI_ASSERT(offset_.Dim() == block_dim_); + // the next call does no work if they point to the same memory. + in_deriv->CopyFromMat(out_deriv); + in_deriv->MulColsVec(scale_); + } +} + +void BatchNormComponent::StoreStats( + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + void *memo_in) { + // in test mode this component does not store stats, it doesn't provide the + // kStoresStats flag. + KALDI_ASSERT(!test_mode_); + KALDI_ASSERT(out_value.NumCols() == dim_ || out_value.NumCols() == block_dim_); + if (out_value.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. 
+ KALDI_ASSERT(out_value.Stride() == out_value.NumCols()); + int32 ratio = dim_ / block_dim_, + orig_rows = out_value.NumRows(), + orig_cols = out_value.NumCols(), + new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; + CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, + new_cols, new_cols); + // we'll never use in_value, so just pass it in unchanged. + StoreStats(in_value, out_value_reshaped, memo_in); + return; + } + + Memo *memo = static_cast(memo_in); + KALDI_ASSERT(out_value.NumRows() == memo->num_frames); + + CuSubVector mean(memo->mean_uvar_scale, 0), + uvar(memo->mean_uvar_scale, 1); + KALDI_ASSERT(mean.Dim() == block_dim_ && memo->num_frames > 0); + BaseFloat num_frames = memo->num_frames; + if (stats_sum_.Dim() != block_dim_) { + stats_sum_.Resize(block_dim_); + stats_sumsq_.Resize(block_dim_); + KALDI_ASSERT(count_ == 0); + } + count_ += num_frames; + stats_sum_.AddVec(num_frames, mean, 1.0); + stats_sumsq_.AddVec(num_frames, uvar, 1.0); +} + +void BatchNormComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &block_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &epsilon_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &target_rms_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &test_mode_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); + ExpectToken(is, binary, ""); + stats_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + stats_sumsq_.Read(is, binary); + stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0); + stats_sum_.Scale(count_); + stats_sumsq_.Scale(count_); + ExpectToken(is, binary, ""); + ComputeDerived(); + Check(); +} + +void BatchNormComponent::Write(std::ostream &os, bool binary) const { + Check(); + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, 
binary, ""); + WriteBasicType(os, binary, block_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, epsilon_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, target_rms_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, test_mode_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + CuVector mean(stats_sum_), var(stats_sumsq_); + if (count_ != 0) { + mean.Scale(1.0 / count_); + var.Scale(1.0 / count_); + var.AddVecVec(-1.0, mean, mean, 1.0); + } + WriteToken(os, binary, ""); + mean.Write(os, binary); + WriteToken(os, binary, ""); + var.Write(os, binary); + WriteToken(os, binary, ""); +} + +void BatchNormComponent::Scale(BaseFloat scale) { + if (scale == 0) { + count_ = 0.0; + stats_sum_.SetZero(); + stats_sumsq_.SetZero(); + } else { + count_ *= scale; + stats_sum_.Scale(scale); + stats_sumsq_.Scale(scale); + } +} + + +void BatchNormComponent::Add(BaseFloat alpha, const Component &other_in) { + const BatchNormComponent *other = + dynamic_cast(&other_in); + count_ += alpha * other->count_; + stats_sum_.AddVec(alpha, other->stats_sum_); + stats_sumsq_.AddVec(alpha, other->stats_sumsq_); + // this operation might change offset_ and scale_, so we recompute them + // in this instance (but not in Scale()). + ComputeDerived(); +} + +void BatchNormComponent::ZeroStats() { + // We only zero the stats if we're not in test mode. In test mode, this would + // be dangerous as the stats are the source for the transform, and zeroing + // them and then calling ComputeDerived() again would remove the transform + // parameters (offset_ and scale_). + if (!test_mode_) { + count_ = 0.0; + stats_sum_.SetZero(); + stats_sumsq_.SetZero(); + } +} + + + + +/** + MEMORY_NORM_MATH + + This comment describes the equations involved in 'memory-norm'. + memory-norm is like batch normalization, except instead of computing + everything on the current minibatch, we deal with decaying averages + over time, interpreted as expectations. 
We'll firm up the math later. + The idea is to obtain a form of batch-norm that is compatible with + use in recurrent neural nets. + + Everything is dimension by dimension here, so let's imagine the input and + output are one-dimensional. Any index 'i' is going to be like a frame index + or an index referring to a sample. We'll be writing down some expectations, + and we're rather cavalier with notation; these basically mean + exponentially-decaying weighted averages over time. + + The input will be x(i), and the output y(i). + + Each frame will have a weight, w(i) >= 0. (these will be part of the + decaying averages)... + + Let's define + count = \sum_i w(i) + sum = \sum_i w(i) x(i) + sumsq = \sum_i w(i) x(i)^2 + + We can compute: + mean = sum / count + var = epsilon + (sumsq / count) - (mean * mean) + scale = var^{-0.5} + + y(i) = (x(i) - mean) * scale. + + We are given the derivatives of the objective function w.r.t. the + outputs; we'll write these as y'(i) [CAUTION: this is nonstandard + notation. An apostrophe on something means the derivative of the + objective function w.r.t. that thing]. + + Over this data, with these weights, we can compute the derivative + of the objective w.r.t. the mean and the scale: + + mean' = -scale * \sum_i w(i) y'(i) + scale' = \sum_i w(i) y'(i) (x(i) - mean) + = 1/scale \sum_i w(i) y'(i) y(i) + var' = -0.5 var^{-1.5} scale' + = -0.5 var^{-1} \sum_i w(i) y'(i) y(i) + + It will be convenient to write down 'per-frame' versions of all of these + quantities, which are divided by the total count: + mean_norm' = mean' / count + scale_norm' = scale' / count + var_norm' = var' / count + (we keep the apostrophe on these quantities as it clarifies that they + are derivatives of the objective function w.r.t something). + + Now, 'var' can be written as: + var = epsilon + (1/count) \sum_i w(i) (x(i) - mean)^2 + and the following formula is more convenient to propagate the derivative + back to an x(i). 
+ Note: the following has 3 terms, which we can think of as + "direct term" (given fixed mean and scale), + "term via mean" (term that comes via derivative of the mean) + "term via scale" (term that comes via derivative of the scale) + + + x'(i) = y'(i)*scale + mean_norm' + 2 var_norm' (x(i) - mean) + = y'(i)*scale + mean_norm' + 2 var_norm' y(i) / scale + = y'(i)*scale + mean_norm' - y(i) * scale/count * \sum_i w(i) y'(i) y(i) + + I'm afraid I just pulled the above out of thin air... needs some more + derivation. The part about (x(i) - mean) can be obtained, I believe, + from computation of the derivative of the variance w.r.t. the x(i) values. + +*/ + + +void MemoryNormComponent::SetTestMode(bool test_mode) { + test_mode_ = test_mode; +} + +void MemoryNormComponent::Check() const { + KALDI_ASSERT(dim_ > 0 && block_dim_ > 0 && dim_ % block_dim_ == 0 && + epsilon_ > 0.0 && target_rms_ > 0.0 && + stats_count_ >= 0.0 && backward_count_ >= 0.0); + +} + +MemoryNormComponent::MemoryNormComponent(const MemoryNormComponent &other): + dim_(other.dim_), block_dim_(other.block_dim_), epsilon_(other.epsilon_), + target_rms_(other.target_rms_), + include_indirect_derivative_(other.include_indirect_derivative_), + test_mode_(other.test_mode_), + stats_count_(other.stats_count_), backward_count_(other.backward_count_), + data_(other.data_) { + Check(); +} + + +std::string MemoryNormComponent::Info() const { + std::ostringstream stream; + stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ + << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ + << ", include-indirect-derivative=" + << (include_indirect_derivative_ ? "true" : "false") + << ", stats-count=" << stats_count_ << ", backward-count=" + << backward_count_ + << ", test-mode=" << (test_mode_ ? 
"true" : "false"); + if (stats_count_ > 0.0) { + CuSubVector x_mean(data_, 0), + y_deriv(data_, 2), y_deriv_y(data_, 3), + scale(data_, 4), x_deriv(data_, 5), + scale_deriv(data_, 6); + if (stats_count_ > 0.0) + stream << ", x-mean=" << SummarizeVector(x_mean) + << ", scale=" << SummarizeVector(scale); + if (backward_count_ > 0.0) + stream << ", y-deriv=" << SummarizeVector(y_deriv) + << ", y-deriv-y=" << SummarizeVector(y_deriv_y) + << ", x-deriv=" << SummarizeVector(x_deriv) + << ", scale-deriv=" << SummarizeVector(scale_deriv); + } + return stream.str(); +} + +void MemoryNormComponent::InitFromConfig(ConfigLine *cfl) { + dim_ = -1; + block_dim_ = -1; + epsilon_ = 1.0e-03; + target_rms_ = 1.0; + include_indirect_derivative_ = true; + test_mode_ = false; + + bool ok = cfl->GetValue("dim", &dim_); + cfl->GetValue("block-dim", &block_dim_); + cfl->GetValue("epsilon", &epsilon_); + cfl->GetValue("target-rms", &target_rms_); + cfl->GetValue("include-indirect-derivative", &include_indirect_derivative_); + cfl->GetValue("test-mode", &test_mode_); + if (!ok || dim_ <= 0) { + KALDI_ERR << "MemoryNormComponent must have 'dim' specified, and > 0"; + } + if (block_dim_ == -1) + block_dim_ = dim_; + if (!(block_dim_ > 0 && dim_ % block_dim_ == 0 && + epsilon_ > 0 && target_rms_ > 0)) + KALDI_ERR << "Invalid configuration in MemoryNormComponent."; + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + stats_count_ = 0.0; + backward_count_ = 0.0; + data_.Resize(7, block_dim_); +} + + + +void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(SameDim(in, *out) && + (in.NumCols() == dim_ || in.NumCols() == block_dim_)); + if (in.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. 
+ KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); + int32 ratio = dim_ / block_dim_, orig_rows = in.NumRows(), + orig_cols = in.NumCols(), new_rows = orig_rows * ratio, + new_cols = orig_cols / ratio; + CuSubMatrix in_reshaped(in.Data(), new_rows, new_cols, new_cols), + out_reshaped(out->Data(), new_rows, new_cols, new_cols); + return Propagate(indexes, in_reshaped, &out_reshaped); + } + + if (out->Data() != in.Data()) + out->CopyFromMat(in); + + // From this point, we can assume that the num-cols of 'in' and 'out' + // equals block_dim_. + if (test_mode_) { + if (stats_count_ <= 0.0) + KALDI_ERR << "Test mode set but no stats available."; + CuSubVector x_mean(data_, 3), scale(data_, 4); + out->AddVecToRows(-1.0, x_mean); + out->MulColsVec(scale); + return NULL; + } else { + Memo *memo = GetMemo(in); + CuSubVector x_sum(memo->data, 0), + scale(memo->data, 2); + out->AddVecToRows(-1.0 / memo->num_frames, x_sum); + out->MulColsVec(scale); + return memo; + } +} + + +MemoryNormComponent::Memo* MemoryNormComponent::GetMemo( + const CuMatrixBase &in) const { + KALDI_ASSERT(in.NumCols() == block_dim_ && !test_mode_); + Memo *memo = new Memo; + int32 num_frames = in.NumRows(); + memo->num_frames = num_frames; + memo->data.Resize(5, block_dim_); + CuSubVector x_sum(memo->data, 0), + x_sumsq(memo->data, 1); + x_sum.AddRowSumMat(1.0, in, 0.0); + x_sumsq.AddDiagMat2(1.0, in, kTrans, 0.0); + if (stats_count_ > 0.0) { + memo->has_indirect_terms = include_indirect_derivative_; + if (include_indirect_derivative_) { + // copy over scale, x_deriv and scale_deriv. + memo->data.RowRange(2, 3).CopyFromMat(data_.RowRange(4, 3)); + } else { + // just copy over the scale. x_deriv and scale_deriv remain zero. + memo->data.Row(2).CopyFromVec(data_.Row(4)); + } + } else { + // We should only reach this point on when processing the first + // minibatch of each training job. + + // note: 'x_deriv' and 'scale_deriv' will be zero. 
This means we're + // ignoring the smaller, indirect term in the derivative for the first + // minibatch of each training job. That indirect term is really not that + // important that we should worry much about this. + memo->has_indirect_terms = false; + + CuSubVector scale(memo->data, 2); + scale.CopyFromVec(x_sumsq); + scale.AddVecVec(-1.0 / (num_frames * 1.0 * num_frames), + x_sum, x_sum, 1.0 / num_frames); + // At this point 'scale' is the variance. + // We apply the floor at 0.0 as a failsafe for problems caused by roundoff. + scale.ApplyFloor(0.0); + scale.Add(epsilon_); + // At this point 'scale' is the variance plus epsilon. + scale.ApplyPow(-0.5); + // OK, now 'scale' is the actual scale: the inverse standard deviation. + } + return memo; +} + +void MemoryNormComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, // unused. + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo_in, + Component *to_update_in, + CuMatrixBase *in_deriv) const { + + KALDI_ASSERT(SameDim(out_deriv, *in_deriv) && + (out_deriv.NumCols() == dim_ || + out_deriv.NumCols() == block_dim_)); + if (out_deriv.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(out_deriv.Stride() == out_deriv.NumCols() && + in_deriv->Stride() == in_deriv->NumCols()); + if (out_value.NumRows() != 0) { + KALDI_ASSERT(out_value.Stride() == out_value.NumCols()); + } + int32 ratio = dim_ / block_dim_, + orig_rows = out_value.NumRows(), + orig_cols = out_value.NumCols(), + new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; + CuSubMatrix + out_deriv_reshaped(out_deriv.Data(), new_rows, new_cols, new_cols), + in_deriv_reshaped(in_deriv->Data(), new_rows, new_cols, new_cols); + + // we'll never use in_value, so pass it in unchanged. 
+ if (out_value.NumRows() != 0) { + CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, + new_cols, new_cols); + Backprop(debug_info, indexes, in_value, + out_value_reshaped, out_deriv_reshaped, + memo_in, to_update_in, &in_deriv_reshaped); + } else { + Backprop(debug_info, indexes, in_value, + out_value, out_deriv_reshaped, + memo_in, to_update_in, &in_deriv_reshaped); + } + return; + } + + // assume in_deriv is non-NULL, because a non-updatable Component will not + // have the backprop called if the in_deriv is non-NULL. + + if (test_mode_) { + KALDI_ASSERT(memo_in == NULL && stats_count_ != 0.0); + // the following is a no-op if in_deriv and out_deriv are the same matrix. + in_deriv->CopyFromMat(out_deriv); + CuSubVector scale(data_, 4); + in_deriv->MulColsVec(scale); + return; + } + + // OK, we're not in test mode. + // Before computing 'in_deriv', we may need to store some stats. + if (include_indirect_derivative_ && to_update_in != NULL) { + // Store some stats which are necessary to compute the 'indirect derivative' + // term (this is analogous to the part of the derivative in regular backprop + // that comes from the objf derivative w.r.t. the mean and variance stats). + // + // Note: instead of simply adding to the stats 'y_deriv' and 'y_deriv_y', + // the following equations do a kind of weighted combination, because + // these stats are stored normalized by the total count (backward_count_). 
+ MemoryNormComponent *to_update = + dynamic_cast(to_update_in); + BaseFloat backward_count = to_update->backward_count_, + num_frames = in_deriv->NumRows(), + new_backward_count = backward_count + num_frames, + old_weight = backward_count / new_backward_count; + CuSubVector y_deriv(to_update->data_, 2), + y_deriv_y(to_update->data_, 3); + // The factor 1.0 / new_backward_count that appears below can be perhaps more + // clearly written as follows: first define + // new_weight = num_frames / new_backward_count + // and then write new_weight / num_frames, which simplifies to + // 1.0 / new_backward_count. The factor of 1.0 / num_frames is necessary to + // convert from data sums to a per-frame average. + y_deriv.AddRowSumMat(1.0 / new_backward_count, out_deriv, old_weight); + y_deriv_y.AddDiagMatMat(1.0 / new_backward_count, out_deriv, kTrans, + out_value, kNoTrans, old_weight); + to_update->backward_count_ = new_backward_count; + // We don't bother calling to_update->ComputeDerived()-- although it would + // be harmless-- because in the current situations where this code is + // reached, to_update will be the delta_nnet_, and the derived parameters of + // delta_nnet_ aren't used. + + // to_update->ComputeDerived(); + } + + // the following does no work if in_deriv and out_deriv are the same matrix. + in_deriv->CopyFromMat(out_deriv); + + Memo *memo = static_cast(memo_in); + CuSubVector scale(memo->data, 2); + in_deriv->MulColsVec(scale); + if (memo->has_indirect_terms) { + CuSubVector x_deriv(memo->data, 3), + scale_deriv(memo->data, 4); + in_deriv->AddVecToRows(-1.0, x_deriv); + in_deriv->AddMatDiagVec(-1.0, out_value, kNoTrans, scale_deriv); + } +} + + +void MemoryNormComponent::ComputeDerived() { + KALDI_ASSERT(stats_count_ >= 0.0 && data_.NumRows() == 7); + if (stats_count_ == 0.0) { + // zero 'scale', 'x_deriv' and 'scale_deriv'. 
+ data_.RowRange(4, 3).SetZero(); + return; + } + CuSubVector x_mean(data_, 0), x_uvar(data_, 1), + y_deriv(data_, 2), y_deriv_y(data_, 3), scale(data_, 4); + scale.CopyFromVec(x_uvar); + scale.AddVecVec(-1.0, x_mean, x_mean, 1.0); + // at this point, 'scale' is the variance. + scale.ApplyFloor(0.0); + scale.Add(epsilon_); + scale.ApplyPow(-0.5); + if (backward_count_ == 0.0) { + // The following statement sets x_deriv and scale_deriv to zero. + data_.RowRange(5, 2).SetZero(); + } else { + // The following statement sets x_deriv = y_deriv * scale, + // and scale_deriv = y_deriv_y * scale. + data_.RowRange(5, 2).AddMatDiagVec(1.0, + data_.RowRange(2, 2), kNoTrans, scale, 0.0); + } +} + +void MemoryNormComponent::StoreStats( + const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + void *memo_in) { + // in test mode this component does not store stats; it doesn't provide the + // kStoresStats flag so this function won't be called. + KALDI_ASSERT(!test_mode_ && memo_in != NULL && stats_count_ >= 0.0); + + // We don't actually need 'in_value' and 'out_value', as the + // required statistics are already stored in 'memo_in'. + Memo *memo = static_cast(memo_in); + + BaseFloat num_frames = memo->num_frames, + old_stats_count = stats_count_, + new_stats_count = num_frames + old_stats_count, + old_weight = old_stats_count / new_stats_count; + + // x_mean_and_x_uvar is the first 2 rows of data_. + CuSubMatrix x_mean_and_x_uvar(data_, 0, 2, 0, block_dim_); + // x_sum_and_x_sumsq is the first 2 rows of data_. + CuSubMatrix x_sum_and_x_sumsq(memo->data, 0, 2, 0, block_dim_); + + x_mean_and_x_uvar.Scale(old_weight); + // The factor 1.0 / new_stats_count that appears below can be perhaps more + // clearly written as follows: first define + // new_weight = num_frames / new_stats_count + // and then write 'new_weight / num_frames', which simplifies to + // '1.0 / new_stats_count'. 
The factor of '1.0 / num_frames' + // is necessary to convert from data sums to a per-frame average. + x_mean_and_x_uvar.AddMat(1.0 / new_stats_count, x_sum_and_x_sumsq); + stats_count_ = new_stats_count; + ComputeDerived(); +} + +void MemoryNormComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &block_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &epsilon_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &target_rms_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &include_indirect_derivative_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &test_mode_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &stats_count_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &backward_count_); + ExpectToken(is, binary, ""); + data_.Read(is, binary); + Check(); +} + +void MemoryNormComponent::Write(std::ostream &os, bool binary) const { + Check(); + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, block_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, epsilon_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, target_rms_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, include_indirect_derivative_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, test_mode_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, stats_count_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, backward_count_); + WriteToken(os, binary, ""); + data_.Write(os, binary); + WriteToken(os, binary, ""); +} + +void MemoryNormComponent::Scale(BaseFloat scale) { + if (scale <= 0) { + if (scale < 0.0) + KALDI_WARN << "Setting stats to zero in MemoryNormComponent: requested scale = " + << scale; + // If scale is negative we 
zero the stats. This may not always be the right + // thing to do, so we warn. + data_.SetZero(); + stats_count_ = 0.0; + backward_count_ = 0.0; + } else { + stats_count_ *= scale; + backward_count_ *= scale; + // 'data_' doesn't need to be changed, as all the quantities it contains are + // normalized by the count. + } +} + + +void MemoryNormComponent::Add(BaseFloat alpha, const Component &other_in) { + const MemoryNormComponent *other = + dynamic_cast(&other_in); + + static bool warned = false; + if (alpha < 0.0) { + if (!warned) { + warned = true; + KALDI_WARN << "Adding MemoryNormComponent with negative scale: will do nothing " + << "(will not warn again)."; + } + return; + } + + BaseFloat + new_stats_count = stats_count_ + alpha * other->stats_count_, + new_backward_count = backward_count_ + alpha * other->backward_count_; + + if (new_stats_count > 0.0) { + // This block sets rows 0 and 1 of data_, which we call 'x_mean' and + // 'x_uvar', to the appropriate weighted combination of 'this' and 'other'. + BaseFloat this_scale = stats_count_ / new_stats_count, + other_scale = alpha * other->stats_count_ / new_stats_count; + data_.RowRange(0, 2).Scale(this_scale); + data_.RowRange(0, 2).AddMat(other_scale, other->data_.RowRange(0, 2)); + } + if (new_backward_count > 0.0) { + // This block sets rows 2 and 3 of data_, which we call 'y_deriv' and + // 'y_deriv_y', to the appropriate weighted combination of 'this' and + // 'other'. + BaseFloat this_scale = backward_count_ / new_backward_count, + other_scale = alpha * other->backward_count_ / new_backward_count; + data_.RowRange(2, 2).Scale(this_scale); + data_.RowRange(2, 2).AddMat(other_scale, other->data_.RowRange(2, 2)); + } + stats_count_ = new_stats_count; + backward_count_ = new_backward_count; + ComputeDerived(); +} + +void MemoryNormComponent::ZeroStats() { + // We only zero the stats if we're not in test mode. 
In test mode, this would + // be dangerous as the stats aren't really considered to be stats, they become + // a fixed part of the model. + if (!test_mode_) { + stats_count_ = 0.0; + backward_count_ = 0.0; + data_.SetZero(); + } +} + + + + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h new file mode 100644 index 00000000000..68506174eb7 --- /dev/null +++ b/src/nnet3/nnet-normalize-component.h @@ -0,0 +1,541 @@ +// nnet3/nnet-normalize-component.h + +// Copyright 2011-2013 Karel Vesely +// 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2013 Xiaohui Zhang +// 2014-2015 Vijayaditya Peddinti +// 2014-2015 Guoguo Chen +// 2015 Daniel Galvez +// 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET3_NNET_NORMALIZE_COMPONENT_H_ +#define KALDI_NNET3_NNET_NORMALIZE_COMPONENT_H_ + +#include "nnet3/nnet-common.h" +#include "nnet3/nnet-component-itf.h" +#include "nnet3/natural-gradient-online.h" +#include + +namespace kaldi { +namespace nnet3 { + +/// @file nnet-normalize-component.h +/// +/// This file contains declarations of components that in one way or +/// another normalize their input: NormalizeComponent, BatchNormComponent, +/// and MemoryNormComponent. 
+ +/* + Implements the function: + + y = x * (sqrt(dim(x)) * target-rms) / |x| + + where |x| is the 2-norm of the vector x. I.e. its output is its input + scaled such that the root-mean-square values of its elements equals + target-rms. (As a special case, if the input is zero, it outputs zero). + + Note: if you specify add-log-stddev=true, it adds an extra element to + y which equals log(|x| / sqrt(dim(x))). + + + Configuration values accepted: + dim, or input-dim Input dimension of this component, e.g. 1024. + Will be the same as the output dimension if add-log-stddev=false. + block-dim Defaults to 'dim' you may specify a nonzero divisor + of 'dim'. In this case the input dimension will + be interpreted as blocks of dimension 'block-dim' + to which the nonlinearity described above is applied + separately. + add-log-stddev You can set this to true to add an extra output + dimension which will equal |x| / sqrt(dim(x)). + If block-dim is specified, this is done per block. + target-rms This defaults to 1.0, but if set it to another + (nonzero) value, the output will be scaled by this + factor. + */ +class NormalizeComponent: public Component { + public: + explicit NormalizeComponent(const NormalizeComponent &other); + + virtual int32 Properties() const { + return kSimpleComponent|kBackpropNeedsInput|kBackpropAdds| + (add_log_stddev_ ? 0 : kPropagateInPlace|kBackpropInPlace) | + (block_dim_ != input_dim_ ? 
kInputContiguous|kOutputContiguous : 0); + } + NormalizeComponent() { } + virtual std::string Type() const { return "NormalizeComponent"; } + virtual void InitFromConfig(ConfigLine *cfl); + virtual Component* Copy() const { return new NormalizeComponent(*this); } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + virtual int32 InputDim() const { return input_dim_; } + virtual int32 OutputDim() const { + return (input_dim_ + (add_log_stddev_ ? (input_dim_ / block_dim_) : 0)); + } + virtual std::string Info() const; + private: + NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow. + enum { kExpSquaredNormFloor = -66 }; + // kSquaredNormFloor is about 0.7e-20. We need a value that's exactly representable in + // float and whose inverse square root is also exactly representable + // in float (hence, an even power of two). + static const BaseFloat kSquaredNormFloor; + int32 input_dim_; + int32 block_dim_; + BaseFloat target_rms_; // The target rms for outputs, default 1.0. + + bool add_log_stddev_; // If true, log(max(epsi, sqrt(row_in^T row_in / D))) + // is an extra dimension of the output. +}; + + +/* + BatchNormComponent + + This implements batch normalization; for each dimension of the + input it normalizes the data to be zero-mean, unit-variance. You + can set the block-dim configuration value to implement spatial + batch normalization, see the comment for the variable. 
+ + If you want to combine this with the trainable offset and scale that the + original BatchNorm paper used, then follow this by the + ScaleAndOffsetComponent. + + It's a simple component (uses the kSimpleComponent flag), but it is unusual in + that it will give different results if you call it on half the matrix at a + time. Most of the time this would be pretty harmless, so we still return the + kSimpleComponent flag. We may have to modify the test code a little to + account for this, or possibly remove the kSimpleComponent flag. In some sense + each output Index depends on every input Index, but putting those dependencies + explicitly into the dependency-tracking framework as a GeneralComponent + would be very impractical and might lead to a lot of unnecessary things being + computed. You have to be a bit careful where you put this component, and understand + what you're doing e.g. putting it in the path of a recurrence is a bit problematic + if the minibatch size is small. + + Accepted configuration values: + dim Dimension of the input and output + block-dim Defaults to 'dim', but may be set to a nonzero divisor + of 'dim'. In this case, each block of dimension 'block-dim' + is treated like a separate row of the input matrix, which + means that the stats from n'th element of each + block are pooled into one class, for each n.a + epsilon Small term added to the variance that is used to prevent + division by zero + target-rms This defaults to 1.0, but if set, for instance, to 2.0, + it will normalize the standard deviation of the output to + 2.0. 'target-stddev' might be a more suitable name, but this + was chosen for consistency with NormalizeComponent. + */ +class BatchNormComponent: public Component { + public: + + BatchNormComponent() { } + + // call this with 'true' to set 'test mode' where the batch normalization is + // done with stored stats. 
There won't normally be any need to specially + // accumulate these stats; they are stored as a matter of course on each + // iteration of training, as for NonlinearComponents, and we'll use the stats + // from the most recent [script-level] iteration. + void SetTestMode(bool test_mode); + + // constructor using another component + BatchNormComponent(const BatchNormComponent &other); + + virtual int32 InputDim() const { return dim_; } + virtual int32 OutputDim() const { return dim_; } + + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + virtual std::string Type() const { return "BatchNormComponent"; } + virtual int32 Properties() const { + // If the block-dim is less than the dim, we need the input and output + // matrices to be contiguous (stride==num-cols), as we'll be reshaping + // internally. This is not much of a cost, because this will be used + // in convnets where we have to do this anyway. + return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace| + kBackpropInPlace| + (block_dim_ < dim_ ? kInputContiguous|kOutputContiguous : 0)| + (test_mode_ ? 0 : kUsesMemo|kStoresStats); + } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *, // to_update, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); // This Read function + // requires that the Component has the correct type. 
+ + /// Write component to stream + virtual void Write(std::ostream &os, bool binary) const; + virtual Component* Copy() const { return new BatchNormComponent(*this); } + + virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + virtual void ZeroStats(); + + + virtual void DeleteMemo(void *memo) const { delete static_cast(memo); } + + virtual void StoreStats(const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + void *memo); + + // Members specific to this component type. + // Note: the offset and scale will only be nonempty in 'test mode'. + const CuVector &Offset() const { return offset_; } + const CuVector &Scale() const { return scale_; } + + private: + + struct Memo { + // number of frames (after any reshaping). + int32 num_frames; + // 'mean_uvar_scale' is of dimension 4 by block_dim_: + // Row 0 = mean = the mean of the rows of the input + // Row 1 = uvar = the uncentered variance of the input (= sumsq / num_frames). + // Row 2 = scale = the scale of the renormalization, which is + // the inverse stddev of the input (modified by epsilon_, + // see the Propagate function). + // Row 3 is used as a temporary in Backprop. + CuMatrix mean_uvar_scale; + }; + + void Check() const; + + // this function is used in a couple of places; it turns the raw stats into + // the offset/scale term of a normalizing transform. + static void ComputeOffsetAndScale(double count, + BaseFloat epsilon, + const Vector &stats_sum, + const Vector &stats_sumsq, + Vector *offset, + Vector *scale); + // computes derived parameters offset_ and scale_. + void ComputeDerived(); + + // Dimension of the input and output. + int32 dim_; + // This would normally be the same as dim_, but if it's less (and it must be > + // 0 and must divide dim_), then each separate block of the input of dimension + // 'block_dim_' is treated like a separate frame for the purposes of + // normalization. 
This can be used to implement spatial batch normalization + // for convolutional setups-- assuming the filter-dim has stride 1, which it + // always will in the new code in nnet-convolutional-component.h. + int32 block_dim_; + + // Used to avoid exact-zero variances, epsilon has the dimension of a + // covariance. + BaseFloat epsilon_; + + // This value will normally be 1.0, which is the default, but you can set it + // to other values as a way to control how fast the following layer learns + // (smaller -> slower). The same config exists in NormalizeComponent. + BaseFloat target_rms_; + + // This is true if we want the batch normalization to operate in 'test mode' + // meaning the data mean and stddev used for the normalization are fixed + // quantities based on previously accumulated stats. Note: the stats we use + // for this are based on the same 'StoreStats' mechanism as we use for + // components like SigmoidComponent and ReluComponent; we'll be using + // the stats from the most recent [script-level] iteration of training. + bool test_mode_; + + + // total count of stats stored by StoreStats(). + double count_; + // sum-of-data component of stats of input data. + CuVector stats_sum_; + // sum-of-squared component of stats of input data. + CuVector stats_sumsq_; + + // offset_ and scale_ are derived from stats_sum_ and stats_sumsq_; they + // dictate the transform that is done in 'test mode'. They are set only when + // reading the model from disk and when calling SetTestMode(true); they are + // resized to empty when the stats are updated, to ensure that out-of-date + // values are not kept around. + CuVector offset_; + CuVector scale_; +}; + + +/* + MemoryNormComponent + + MemoryNormComponent is like batch normalization, except the stats + are accumulated as a weighted sum over past minibatches (if this is + not the first minibatch), instead of over the current minibatch. + + You can use it in the same way you would normally use BatchNormComponent. 
+ + Accepted configuration values: + dim Dimension of the input and output + block-dim Defaults to 'dim', but may be set to a nonzero divisor + of 'dim'. In this case, each block of dimension 'block-dim' + is treated like a separate row of the input matrix, which + means that the stats from n'th element of each + block are pooled into one class, for each n.a + epsilon Small term added to the variance that is used to prevent + division by zero + target-rms This defaults to 1.0, but if set, for instance, to 2.0, + it will normalize the standard deviation of the output to + 2.0. 'target-stddev' might be a more suitable name, but this + was chosen for consistency with NormalizeComponent. + include-indirect-derivative This defaults to true, which means we + include the (smaller) derivative term that comes via the + mean and variance estimation. You might want to set this to + false for testing purposes. + */ +class MemoryNormComponent: public Component { + public: + + MemoryNormComponent() { } + + // constructor using another component + MemoryNormComponent(const MemoryNormComponent &other); + + virtual int32 InputDim() const { return dim_; } + virtual int32 OutputDim() const { return dim_; } + + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + virtual std::string Type() const { return "MemoryNormComponent"; } + virtual int32 Properties() const { + // If the block-dim is less than the dim, we need the input and output + // matrices to be contiguous (stride==num-cols), as we'll be reshaping + // internally. This is not much of a cost, because this will be used + // in convnets where we have to do this anyway. + bool iid = include_indirect_derivative_; + return kSimpleComponent|kPropagateInPlace|kBackpropInPlace| + (test_mode_ ? 0 : kUsesMemo|kStoresStats|(iid?kBackpropNeedsOutput:0))| + (block_dim_ < dim_ ? kInputContiguous|kOutputContiguous : 0); + + } + + // Call this function to set 'test mode' to true or false. 
In test + // mode the stats are frozen and will not be updated. + void SetTestMode(bool test_mode); + + + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + + /// The backprop function. In addition to propagating the input back to + /// 'in_deriv', if supplied, this function also updates, in 'to_update', + /// backward_count_ and the rows named 'y_deriv' and 'y_deriv_y' of + /// data_, and also the derived quantities 'x_deriv' and 'scale_deriv' + /// of data_. + /// (note: in training, 'to_update' will point to delta_nnet_, and later these + /// stats get added to nnet_ via Add()) + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, // in_value + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); // This Read function + // requires that the Component has the correct type. + + /// Write component to stream + virtual void Write(std::ostream &os, bool binary) const; + virtual Component* Copy() const { return new MemoryNormComponent(*this); } + + // Note: if you scale by a negative number it will set stats to zero + // rather than allow a negative stats count. + virtual void Scale(BaseFloat scale); + // Note: if you try to add with negative coefficient (as in backstitch), it + // will do nothing. + virtual void Add(BaseFloat alpha, const Component &other); + virtual void ZeroStats(); + + virtual void DeleteMemo(void *memo) const { delete static_cast(memo); } + + /// This function updates stats_count_, the rows named 'x_mean', 'x_uvar' + /// of data_, and also the derived quantities stored in the rows named + /// 'scale', 'x_deriv' and 'scale_deriv' of data_. 
+ /// (note: in training, this is called on the delta_nnet_, and later + /// the stats get added to nnet_ via Add()) + virtual void StoreStats(const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + void *memo); + + private: + + struct Memo { + // The number of frames (after any reshaping; so in general it will + // be the original NumRows() of the matrix, times dim_ / block_dim_). + int32 num_frames; + // 'data' is of dimension 5 by block_dim_. + // Row 0, which we'll call 'x_sum', is the sum of the rows of the + // input data. + // Row 1, which we'll call 'x_sumsq', is the sum of the rows of the + // elementwise square of the input data matrix. + // Row 2,3,4 are 'scale', 'x_deriv', 'scale_deriv', which + // are just copies of the corresponding values in + // MemoryNormComponent::data_ (from the const nnet, the one we're + // training), and which will have been copied from there when this + // object was created. However if stats_count_ was <= 0 when this + // object was created (first minibatch), then 'scale' + // will be set to the mean and inverse-stddev implied by the stats + // 'sum' and 'sumsq', and 'x_deriv' and 'scale_deriv' will be zero. + // This is so that it does something sensible on the very first + // minibatch we train. The reason why we copy these quantities here + // is because in the backprop phase we feel it would be better to + // use the same values that were used in the forward propagation, + // instead of the possibly-updated values that might exist when + // Backprop() is called. It's actually not clear whether this is + // necessary. + CuMatrix data; + + // This is set to true if we have the 'indirect' terms in the derivative, + // relating to the 'x_deriv' and 'scale_deriv' terms in 'data'. If false, + // we save some computation. + bool has_indirect_terms; + }; + + + /// This piece of code, which has been broken out from Propagate(), computes + /// the memo. Expects in.NumCols() == block_dim_. 
It should only be called + /// if test_mode_ is false. + Memo *GetMemo(const CuMatrixBase &in) const; + + /// This function computes certain members of data_ that are derived: + /// specifically, rows 4, 5 and 6, which are called 'scale', 'x_deriv' and + /// 'scale_deriv'. + void ComputeDerived(); + + void Check() const; + + // this function is used in a couple of places; it turns the raw stats into + // the offset/scale term of a normalizing transform. + static void ComputeOffsetAndScale(BaseFloat count, + BaseFloat epsilon, + const Vector &stats_sum, + const Vector &stats_sumsq, + Vector *offset, + Vector *scale); + + // Dimension of the input and output. + int32 dim_; + + // block_dim_ would normally be the same as dim_, but if it's less (and it + // must be > 0 and must divide dim_), then each separate block of the input of + // dimension 'block_dim_' is treated like a separate frame for the purposes of + // normalization. This can be used to implement spatial batch normalization + // for convolutional setups-- assuming the filter-dim has stride 1, which it + // always will in the new code in nnet-convolutional-component.h. + int32 block_dim_; + + // Used to avoid exact-zero variances, epsilon has the dimension of a + // covariance. + BaseFloat epsilon_; + + // This controls the dynamic range of the output. At 1.0 which is the + // default, the output has unit standard deviation, but you can set it to + // other values. The same config exists in NormalizeComponent. + BaseFloat target_rms_; + + // If true, we include the smaller indirect part of the derivative, that comes + // via the stats estimation. This is included mostly for testing purposes; we + // expect this will normally be true. + bool include_indirect_derivative_; + + // If test_mode_ is set, no stats will be accumulated. It's an error if + // test_mode_ is set and the data count is zero, and you try to propagate. 
+ bool test_mode_; + + // The total count of stats stored by StoreStats(), and which are represented + // in x_mean = data_.Row(0) and x_uvar = data_.Row(1). We never allow this to + // become less than zero, even if people do unexpected things with Add() and + // Scale(). + BaseFloat stats_count_; + + // backward_count_ is the total count of stats accumulated during backprop, + // and represents the count corresponding to the stats in 'y_deriv' and + // 'y_deriv_y'. It is expected to be either zero or the same as stats_count_, + // in most circumstances, depending whether you were doing backprop or just + // inference-- but we don't enforce this because there may be situations where + // this is not the case. + // + // We never allow this to become less than zero, even if people do unexpected + // things with Add() and Scale(). + BaseFloat backward_count_; + + // We store data_ as a single matrix because it enables certain operations + // to be done using fewer kernels, but it contains various different quantities, + // which we'll describe below as if they were separate variables. + // data_ is of dimension 7 by block_dim_. + CuMatrix data_; + // data_.Row(0) is 'x_mean', which is the decaying moving-average of + // input data x; or zero if stats_count_ is zero. + // data_.Row(1) is 'x_uvar', which is the decaying moving-average of + // input data x^2 or zero if stats_count_ is zero. + // data_.Row(2) is 'y_deriv', which is the decaying moving-average + // derivative of the objective w.r.t. the output y; or + // zero if backward_count_ is zero. + // data_.Row(3) is 'y_deriv_y', which is the decaying moving average + // of the product of the output times (the derivative of the + // objective w.r.t. the output); or zero if backward_count_ + // is zero. + // + // The quantities below are derived from the stats above. 
+ // + // data_.Row(4) is 'scale', which is the inverse square root of the + // covariance computed from x_mean and x_uvar (plus epsilon), + // or zero if stats_count_ is zero. + // data_.Row(5) is 'x_deriv', which is the negative of the average derivative + // (per frame) of the objective function w.r.t the input x (just the + // part that comes via the derivative w.r.t. the x mean). + // 'x_deriv' equals 'y_deriv' times 'scale'. + // data_.Row(6) is 'scale_deriv', which relates to the part of the + // derivative w.r.t. the input that comes from the objf + // derivative w.r.t. the scale. It equals scale * y_deriv_y. +}; + + + + + +} // namespace nnet3 +} // namespace kaldi + + +#endif diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc index 2c4da825013..6dd2873bd81 100644 --- a/src/nnet3/nnet-parse.cc +++ b/src/nnet3/nnet-parse.cc @@ -481,7 +481,7 @@ static void PrintFloatSuccinctly(std::ostream &os, BaseFloat f) { // Returns a string that summarizes a vector fairly succintly, for // printing stats in info lines. -std::string SummarizeVector(const Vector &vec) { +std::string SummarizeVector(const VectorBase &vec) { std::ostringstream os; if (vec.Dim() < 10) { os << "[ "; @@ -517,6 +517,11 @@ std::string SummarizeVector(const Vector &vec) { return os.str(); } +std::string SummarizeVector(const CuVectorBase &cu_vec) { + Vector vec(cu_vec); + return SummarizeVector(vec); +} + void PrintParameterStats(std::ostringstream &os, const std::string &name, const CuVectorBase ¶ms, diff --git a/src/nnet3/nnet-parse.h b/src/nnet3/nnet-parse.h index fef21301ff6..7f1380bf253 100644 --- a/src/nnet3/nnet-parse.h +++ b/src/nnet3/nnet-parse.h @@ -191,7 +191,9 @@ std::string ErrorContext(const std::string &str); // Returns a string that summarizes a vector fairly succintly, for // printing stats in info lines. 
-std::string SummarizeVector(const Vector &vec); +std::string SummarizeVector(const VectorBase &vec); + +std::string SummarizeVector(const CuVectorBase &vec); /** Print to 'os' some information about the mean and standard deviation of some parameters, used in Info() functions in nnet-simple-component.cc. diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index d6c4e2163bf..e76f7cae2a7 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -313,179 +313,6 @@ void ElementwiseProductComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); } -const BaseFloat NormalizeComponent::kSquaredNormFloor = - pow(2.0, NormalizeComponent::kExpSquaredNormFloor); - -NormalizeComponent::NormalizeComponent(const NormalizeComponent &other): - input_dim_(other.input_dim_), block_dim_(other.block_dim_), - target_rms_(other.target_rms_), - add_log_stddev_(other.add_log_stddev_) { } - -void NormalizeComponent::InitFromConfig(ConfigLine *cfl) { - input_dim_ = 0; - add_log_stddev_ = false; - target_rms_ = 1.0; - bool ok = cfl->GetValue("dim", &input_dim_) || - cfl->GetValue("input-dim", &input_dim_); - block_dim_ = input_dim_; - cfl->GetValue("block-dim", &block_dim_); - cfl->GetValue("target-rms", &target_rms_); - cfl->GetValue("add-log-stddev", &add_log_stddev_); - if (!ok || cfl->HasUnusedValues() || input_dim_ <= 0 || target_rms_ <= 0.0 || - block_dim_ <= 0 || input_dim_ % block_dim_ != 0) - KALDI_ERR << "Invalid initializer for layer of type " - << Type() << ": \"" << cfl->WholeLine() << "\""; -} - -void NormalizeComponent::Read(std::istream &is, bool binary) { - std::string token; - ReadToken(is, binary, &token); - if (token == "") { - ReadToken(is, binary, &token); - } - KALDI_ASSERT(token == "" || token == ""); - ReadBasicType(is, binary, &input_dim_); // Read dimension. 
- ReadToken(is, binary, &token); - if (token == "") { - ReadBasicType(is, binary, &block_dim_); - ReadToken(is, binary, &token); - } else { - block_dim_ = input_dim_; - } - // read target_rms_ if it is available. - if (token == "") { - ReadBasicType(is, binary, &target_rms_); - ReadToken(is, binary, &token); - } - // Read add_log_stddev_ token, if it is available. - if (token == "") { - ReadBasicType(is, binary, &add_log_stddev_); - ReadToken(is, binary, &token); - } - if (token == "") { - // back-compatibility code. - CuVector temp; - temp.Read(is, binary); - ExpectToken(is, binary, ""); - temp.Read(is, binary); - ExpectToken(is, binary, ""); - double count; - ReadBasicType(is, binary, &count); - ReadToken(is, binary, &token); - } - KALDI_ASSERT(token == ""); -} - -void NormalizeComponent::Write(std::ostream &os, bool binary) const { - WriteToken(os, binary, ""); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_dim_); - if (block_dim_ != input_dim_) { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, block_dim_); - } - WriteToken(os, binary, ""); - WriteBasicType(os, binary, target_rms_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, add_log_stddev_); - WriteToken(os, binary, ""); -} - -std::string NormalizeComponent::Info() const { - std::ostringstream stream; - stream << Type() << ", input-dim=" << InputDim() - << ", output-dim=" << OutputDim() << ", target-rms=" << target_rms_ - << ", add-log-stddev=" << std::boolalpha << add_log_stddev_; - if (block_dim_ != input_dim_) - stream << ", block-dim=" << block_dim_; - return stream.str(); -} - -// The output y_i = scale * x_i, -// and we want to RMS value of the y_i to equal target_rms, -// so y^t y = D * target_rms^2 (if y is one row of the input). -// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). -// there is also flooring involved, to avoid division-by-zero -// problems. 
It's important for the backprop, that the floor's -// square root is exactly representable as float. -// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D))) -// is an extra dimension of the output. -void* NormalizeComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - KALDI_ASSERT(in.NumCols() == InputDim() && out->NumCols() == OutputDim() && - in.NumRows() == out->NumRows()); - if (block_dim_ != input_dim_) { - int32 num_blocks = input_dim_ / block_dim_, - new_num_rows = in.NumRows() * num_blocks, - output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0); - KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); - CuSubMatrix in_reshaped(in.Data(), new_num_rows, - block_dim_, block_dim_), - out_reshaped(out->Data(), new_num_rows, - output_block_dim, output_block_dim); - cu::NormalizePerRow(in_reshaped, target_rms_, add_log_stddev_, - &out_reshaped); - } else { - cu::NormalizePerRow(in, target_rms_, add_log_stddev_, out); - } - return NULL; -} - -/* - A note on the derivative of NormalizeComponent... - let both row_in and row_out be vectors of dimension D. - Let p = row_in^T row_in / (D * target_rms^2), and let - f = 1.0 / sqrt(max(kSquaredNormFloor, p)), and we compute row_out as: - row_out = f row_in. - Suppose we have a quantity deriv_out which is the derivative - of the objective function w.r.t. row_out. We want to compute - deriv_in which is the derivative of the objective function w.r.t. - row_in. Let the objective function be F. One term is obvious: we have - deriv_in = f deriv_out + .... - next we have to take into account the derivative that gets back-propagated - through f. Obviously, dF/df = deriv_out^T row_in. - And df/dp = (p <= kSquaredNormFloor ? 0.0 : -0.5 p^{-1.5}) = (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3), - and dp/d(row_in) = 2/(D * target_rms^2) row_in. [it's vector_valued]. 
- So this term in dF/d(row_in) equals: - dF/df df/dp dp/d(row_in) = 2/(D * target_rms^2) (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3) (deriv_out^T row_in) row_in - So - deriv_in = f deriv_out + (f == 1.0 ? 0.0 : -f^3 / (D * target_rms^2) ) (deriv_out^T row_in) row_in - - if add_log_stddev_ true, the deriv_in has another term as - dF/dx_i = dF/df . df/dx_i => df/dx_i = x_i/(x^T x) -*/ -void NormalizeComponent::Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, // out_value - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update, - CuMatrixBase *in_deriv) const { - if (!in_deriv) - return; - if (block_dim_ != input_dim_) { - int32 num_blocks = input_dim_ / block_dim_, - new_num_rows = in_value.NumRows() * num_blocks, - output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0); - KALDI_ASSERT(in_value.Stride() == in_value.NumCols() && - out_deriv.Stride() == out_deriv.NumCols() && - in_deriv->Stride() == in_deriv->NumCols()); - CuSubMatrix in_value_reshaped(in_value.Data(), new_num_rows, - block_dim_, block_dim_), - out_deriv_reshaped(out_deriv.Data(), new_num_rows, - output_block_dim, output_block_dim), - in_deriv_reshaped(in_deriv->Data(), new_num_rows, - block_dim_, block_dim_); - cu::DiffNormalizePerRow(in_value_reshaped, out_deriv_reshaped, target_rms_, - add_log_stddev_, &in_deriv_reshaped); - } else { - cu::DiffNormalizePerRow(in_value, out_deriv, target_rms_, add_log_stddev_, - in_deriv); - } -} - void* SigmoidComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { @@ -5880,489 +5707,6 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { } } - - -void BatchNormComponent::ComputeDerived() { - if (!test_mode_) { - offset_.Resize(0); - scale_.Resize(0); - return; - } - - if (count_ == 0.0) { - KALDI_WARN << "Test-mode is set but there is no data count. 
" - "Creating random counts. This only makes sense " - "in unit-tests (or compute_prob_*.0.log). If you see this " - "elsewhere, something is very wrong."; - count_ = 1.0; - stats_sum_.SetRandn(); - stats_sumsq_.SetRandn(); - stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0); - } - - offset_.Resize(block_dim_); - scale_.Resize(block_dim_); - offset_.CopyFromVec(stats_sum_); - offset_.Scale(-1.0 / count_); - // now offset_ is -mean. - scale_.CopyFromVec(stats_sumsq_); - scale_.Scale(1.0 / count_); - scale_.AddVecVec(-1.0, offset_, offset_, 1.0); - // now scale_ is variance. - // Mathematically the ApplyFloor statement should be a no-op; this is in case - // of numerical roundoff. - scale_.ApplyFloor(0.0); - scale_.Add(epsilon_); - scale_.ApplyPow(-0.5); - // now scale_ = min(variance, epsilon)^{-0.5}. - // next, multiply by the target RMS (normally 1.0). - scale_.Scale(target_rms_); - offset_.MulElements(scale_); - // now offset_ is -(scale*mean). -} - -void BatchNormComponent::SetTestMode(bool test_mode) { - test_mode_ = test_mode; - ComputeDerived(); -} - -void BatchNormComponent::Check() const { - KALDI_ASSERT(dim_ > 0 && block_dim_ > 0 && dim_ % block_dim_ == 0 && - epsilon_ > 0.0 && target_rms_ > 0.0); -} - -BatchNormComponent::BatchNormComponent(const BatchNormComponent &other): - dim_(other.dim_), block_dim_(other.block_dim_), epsilon_(other.epsilon_), - target_rms_(other.target_rms_), test_mode_(other.test_mode_), - count_(other.count_), stats_sum_(other.stats_sum_), - stats_sumsq_(other.stats_sumsq_) { - ComputeDerived(); - Check(); -} - - -std::string BatchNormComponent::Info() const { - std::ostringstream stream; - stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ - << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ - << ", count=" << count_ - << ", test-mode=" << (test_mode_ ? 
"true" : "false"); - if (count_ > 0) { - Vector mean(stats_sum_), var(stats_sumsq_); - mean.Scale(1.0 / count_); - var.Scale(1.0 / count_); - // subtract mean^2 from var. - var.AddVecVec(-1.0, mean, mean, 1.0); - var.ApplyFloor(0.0); - var.ApplyPow(0.5); // make it the stddev. - stream << ", data-mean=" << SummarizeVector(mean) - << ", data-stddev=" << SummarizeVector(var); - } - return stream.str(); -} - -void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { - dim_ = -1; - block_dim_ = -1; - epsilon_ = 1.0e-03; - target_rms_ = 1.0; - test_mode_ = false; - bool ok = cfl->GetValue("dim", &dim_); - cfl->GetValue("block-dim", &block_dim_); - cfl->GetValue("epsilon", &epsilon_); - cfl->GetValue("target-rms", &target_rms_); - cfl->GetValue("test-mode", &test_mode_); - if (!ok || dim_ <= 0) { - KALDI_ERR << "BatchNormComponent must have 'dim' specified, and > 0"; - } - if (block_dim_ == -1) - block_dim_ = dim_; - if (!(block_dim_ > 0 && dim_ % block_dim_ == 0 && - epsilon_ > 0 && target_rms_ > 0)) - KALDI_ERR << "Invalid configuration in BatchNormComponent."; - if (cfl->HasUnusedValues()) - KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); - count_ = 0; - stats_sum_.Resize(block_dim_); - stats_sumsq_.Resize(block_dim_); - if (test_mode_) { - ComputeDerived(); - } -} - - - -/* - BATCH_NORM_MATH - - This comment describes the equations involved in batch normalization, and - derives the forward and back-propagation. - - This is all dimension-by-dimension, so we just imagine the inputs - are scalars x(i), for i=0 .. n-1. - - FORWARD PASS: - - Define xsum = sum_i x(i) - x2sum = sum_i x(i)^2 - mean = xsum / n - var = x2sum / n - (mean*mean) - scale = (var + epsilon)^{-0.5} - offset = -mean * scale - - y(i) = scale * x(i) + offset - - Most of the rest of this comment derives how to compute the derivatives. If - you just want the formulas, please skip to the string 'BACKWARD PASS' below. 
- - We'll use a notation where an apostrophe on something means (the derivative of - the objective function w.r.t. that thing), so y'(i) is df/dy(i), and so on. - We are given y'(i). Propagating the derivatives backward: - offset' = sum_i y'(i) - scale' = (sum_i y'(i) * x(i)) - offset' * mean - var' = scale' * -0.5 * (var + epsilon)^{-1.5} - = -0.5 * scale' * scale^3 - mean' = -offset' * scale - 2 * mean * var' - xsum' = mean' / n - x2sum' = var' / n - - So the derivatives propagated back to the original data are: - x'(i) = y'(i) * scale + xsum' + x(i) * x2sum' - - The above is quite complicated to compute, but we can use some invariances - to work out a simpler way to compute the derivatives. - - Firstly, note that x'(i) is of the form: - - x'(i) = y'(i) * scale + [affine function of x(i)]. - - [it's a 1-d affine function, i.e. offset and scale]. - This has the same functional form as: - - x'(i) = y'(i) * scale + [affine function of y(i)]. - - since y(i) is an affine function of x(i) with nonzero scale. - Because the output is invariant to shifts in the input, sum_i x'(i) - will be zero. This is sufficient to determine the bias - term in the affine function. [Note: the scale on y(i) doesn't - come into it because the y(i) sum to zero]. The offset - will just be (sum_i y'(i) * scale / n); this makes the sum of x'(i) zero. - So let's write it as - - x'(i) = (y'(i) - 1/n sum_i y'(i)) * scale + alpha y(i). - - and it will be convenient to define: - - x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale - - which is just y'(i) with mean subtraction, scaled according to - the scale used in the normalization. So write - - x'(i) = x_deriv_base(i) + alpha y(i). - - The question is, what is the scale alpha. We don't actually need to - do any differentiation to figure this out. First, assume there is - no "+ epsilon" in the variance; later we'll explain why this doesn't - matter. The key to working out alpha is that the output is invariant - to scaling of the input. 
Assume we scale around the input's mean, - since that makes the math simpler. We can express this by the - constraint that (\sum_i x'(i) * (x(i) - avg-x)) = 0. This is - equivalent to the constraint that (\sum_i x'(i) y (i)) = 0, since - y(i) is x(i) - avg-x times a nonzero scale. We'll use this contraint - to determine alpha, Using the above expressionfor x(i), we can write - this constraint as: - \sum_i ( y(i) x_deriv_base(i) + alpha y(i) y(i)) = 0. - Now, since we said we'd ignore the epsilon, the output has unit variance, - so we know that \sum_i y(i) y(i) = n. - So alpha = - \sum_i y(i) x_deriv_base(i) / n. We can actually re-imagine - the epsilon term (or variance-flooring) as having been implemented by - adding a couple extra rows to the matrix with suitable values, and zero - output-deriv for those rows. If you think about it carefully you'll see that - the formula above is valid even if there is an extra term - in the variance. Anyway the correctness of the derivative will get tested - throughly by the component unit-tests. - - So to recap, here is the backprop. - - BACKWARD PASS: - - We are given y'(i), scale, and y(i). - - We compute: - x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale - alpha = - \sum_i y(i) x_deriv_base(i) / n - x'(i) = x_deriv_base(i) + alpha y(i) - */ - - - -void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - KALDI_ASSERT(SameDim(in, *out) && - (in.NumCols() == dim_ || in.NumCols() == block_dim_)); - if (in.NumCols() != block_dim_) { - // if block_dim_ != dim_, we recurse; this helps keep the main code - // simple. 
- KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); - int32 ratio = dim_ / block_dim_, orig_rows = in.NumRows(), - orig_cols = in.NumCols(), new_rows = orig_rows * ratio, - new_cols = orig_cols / ratio; - CuSubMatrix in_reshaped(in.Data(), new_rows, new_cols, new_cols), - out_reshaped(out->Data(), new_rows, new_cols, new_cols); - return Propagate(indexes, in_reshaped, &out_reshaped); - } - - // From this point, we can assume that the num-cols of 'in' and 'out' - // equals block_dim_. - - if (!test_mode_) { - // search in the comment above for FORWARD PASS to see what is being - // implemented here. - // if this takes too much time due to multiple different CUDA calls, - // we'll consider making a single kernel for some of it. - Memo *memo = new Memo; - int32 num_frames = in.NumRows(), dim = block_dim_; - memo->num_frames = num_frames; - memo->mean_uvar_scale.Resize(4, dim); - CuSubVector mean(memo->mean_uvar_scale, 0), - uvar(memo->mean_uvar_scale, 1), - scale(memo->mean_uvar_scale, 2); - mean.AddRowSumMat(1.0 / num_frames, in, 0.0); - uvar.AddDiagMat2(1.0 / num_frames, in, kTrans, 0.0); - scale.CopyFromVec(uvar); - // by applying this scale at this point, we save a multiply later on. - BaseFloat var_scale = 1.0 / (target_rms_ * target_rms_); - scale.AddVecVec(-var_scale, mean, mean, var_scale); - // at this point, 'scale' contains just the variance [divided by target-rms^2]. - scale.ApplyFloor(0.0); - scale.Add(var_scale * epsilon_); - // Now 'scale' contains the variance floored to zero and then with epsilon - // added [both divided by target-rms^2]. - scale.ApplyPow(-0.5); - // now 'scale' is the actual scale we'll use. - - // the next command will do no work if out == in, for in-place propagation. 
- out->CopyFromMat(in); - out->AddVecToRows(-1.0, mean, 1.0); - out->MulColsVec(scale); - return static_cast(memo); - } else { - if (offset_.Dim() != block_dim_) { - if (count_ == 0) - KALDI_ERR << "Test mode set in BatchNormComponent, but no stats."; - else // why was ComputeDerived() not called? - KALDI_ERR << "Code error in BatchNormComponent"; - } - out->CopyFromMat(in); - out->MulColsVec(scale_); - out->AddVecToRows(1.0, offset_, 1.0); - return NULL; - } -} - -void BatchNormComponent::Backprop( - const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, // unused - const CuMatrixBase &out_value, - const CuMatrixBase &out_deriv, - void *memo_in, - Component *to_update, // unused - CuMatrixBase *in_deriv) const { - - KALDI_ASSERT(SameDim(out_value, out_deriv) && - SameDim(out_value, *in_deriv) && - (out_value.NumCols() == dim_ || - out_value.NumCols() == block_dim_)); - if (out_value.NumCols() != block_dim_) { - // if block_dim_ != dim_, we recurse; this helps keep the main code - // simple. - KALDI_ASSERT(out_value.Stride() == out_value.NumCols() && - out_deriv.Stride() == out_deriv.NumCols() && - in_deriv->Stride() == in_deriv->NumCols()); - int32 ratio = dim_ / block_dim_, - orig_rows = out_value.NumRows(), - orig_cols = out_value.NumCols(), - new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; - CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, - new_cols, new_cols), - out_deriv_reshaped(out_deriv.Data(), new_rows, new_cols, new_cols), - in_deriv_reshaped(in_deriv->Data(), new_rows, new_cols, new_cols); - // we'll never use in_value, so pass it in unchanged. - Backprop(debug_info, indexes, in_value, - out_value_reshaped, out_deriv_reshaped, - memo_in, to_update, &in_deriv_reshaped); - return; - } - - Memo *memo = static_cast(memo_in); - - if (!test_mode_) { - // search above for BACKWARD PASS for a comment describing the math. 
- KALDI_ASSERT(memo != NULL && "memo not passed into backprop"); - int32 num_frames = memo->num_frames; - KALDI_ASSERT(out_value.NumRows() == num_frames); - CuSubVector temp(memo->mean_uvar_scale, 3), - scale(memo->mean_uvar_scale, 2); - temp.AddRowSumMat(-1.0 / num_frames, out_deriv, 0.0); - // the following does no work if in_deriv and out_deriv are the same matrix. - in_deriv->CopyFromMat(out_deriv); - in_deriv->AddVecToRows(1.0, temp); - in_deriv->MulColsVec(scale); - // at this point, 'in_deriv' contains: - // x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale - temp.AddDiagMatMat(-1.0 / (num_frames * target_rms_ * target_rms_), - out_value, kTrans, *in_deriv, kNoTrans, 0.0); - // now, 'temp' contains the quantity which we described - // in the math as: - // alpha = - \sum_i y(i) x_deriv_base(i) / n. - // The factor 1 / (target_rms_ * target_rms_) comes from following - // this additional scaling factor through the math. In the comment I said - // "we know that \sum_i y(i) y(i) = n". Taking target-rms into account - // this becomes "we know that \sum_i y(i) y(i) = n * target-rms^2". - in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans, temp, 1.0); - // At this point, in_deriv contains x'(i) = x_deriv_base(i) + alpha y(i). - - } else { - KALDI_ASSERT(offset_.Dim() == block_dim_); - // the next call does no work if they point to the same memory. - in_deriv->CopyFromMat(out_deriv); - in_deriv->MulColsVec(scale_); - } -} - -void BatchNormComponent::StoreStats( - const CuMatrixBase &in_value, - const CuMatrixBase &out_value, - void *memo_in) { - // in test mode this component does not store stats, it doesn't provide the - // kStoresStats flag. - KALDI_ASSERT(!test_mode_); - KALDI_ASSERT(out_value.NumCols() == dim_ || out_value.NumCols() == block_dim_); - if (out_value.NumCols() != block_dim_) { - // if block_dim_ != dim_, we recurse; this helps keep the main code - // simple. 
- KALDI_ASSERT(out_value.Stride() == out_value.NumCols()); - int32 ratio = dim_ / block_dim_, - orig_rows = out_value.NumRows(), - orig_cols = out_value.NumCols(), - new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; - CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, - new_cols, new_cols); - // we'll never use in_value, so just pass it in unchanged. - StoreStats(in_value, out_value_reshaped, memo_in); - return; - } - - Memo *memo = static_cast(memo_in); - KALDI_ASSERT(out_value.NumRows() == memo->num_frames); - - CuSubVector mean(memo->mean_uvar_scale, 0), - uvar(memo->mean_uvar_scale, 1); - KALDI_ASSERT(mean.Dim() == block_dim_ && memo->num_frames > 0); - BaseFloat num_frames = memo->num_frames; - if (stats_sum_.Dim() != block_dim_) { - stats_sum_.Resize(block_dim_); - stats_sumsq_.Resize(block_dim_); - KALDI_ASSERT(count_ == 0); - } - count_ += num_frames; - stats_sum_.AddVec(num_frames, mean, 1.0); - stats_sumsq_.AddVec(num_frames, uvar, 1.0); -} - -void BatchNormComponent::Read(std::istream &is, bool binary) { - ExpectOneOrTwoTokens(is, binary, "", ""); - ReadBasicType(is, binary, &dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &block_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &epsilon_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &target_rms_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &test_mode_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &count_); - ExpectToken(is, binary, ""); - stats_sum_.Read(is, binary); - ExpectToken(is, binary, ""); - stats_sumsq_.Read(is, binary); - stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0); - stats_sum_.Scale(count_); - stats_sumsq_.Scale(count_); - ExpectToken(is, binary, ""); - ComputeDerived(); - Check(); -} - -void BatchNormComponent::Write(std::ostream &os, bool binary) const { - Check(); - WriteToken(os, binary, ""); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, dim_); - WriteToken(os, 
binary, ""); - WriteBasicType(os, binary, block_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, epsilon_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, target_rms_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, test_mode_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, count_); - CuVector mean(stats_sum_), var(stats_sumsq_); - if (count_ != 0) { - mean.Scale(1.0 / count_); - var.Scale(1.0 / count_); - var.AddVecVec(-1.0, mean, mean, 1.0); - } - WriteToken(os, binary, ""); - mean.Write(os, binary); - WriteToken(os, binary, ""); - var.Write(os, binary); - WriteToken(os, binary, ""); -} - -void BatchNormComponent::Scale(BaseFloat scale) { - if (scale == 0) { - count_ = 0.0; - stats_sum_.SetZero(); - stats_sumsq_.SetZero(); - } else { - count_ *= scale; - stats_sum_.Scale(scale); - stats_sumsq_.Scale(scale); - } -} - - -void BatchNormComponent::Add(BaseFloat alpha, const Component &other_in) { - const BatchNormComponent *other = - dynamic_cast(&other_in); - count_ += alpha * other->count_; - stats_sum_.AddVec(alpha, other->stats_sum_); - stats_sumsq_.AddVec(alpha, other->stats_sumsq_); - // this operation might change offset_ and scale_, so we recompute them - // in this instance (but not in Scale()). - ComputeDerived(); -} - -void BatchNormComponent::ZeroStats() { - // We only zero the stats if we're not in test mode. In test mode, this would - // be dangerous as the stats are the source for the transform, and zeroing - // them and then calling ComputeDerived() again would remove the transform - // parameters (offset_ and scale_). 
- if (!test_mode_) { - count_ = 0.0; - stats_sum_.SetZero(); - stats_sumsq_.SetZero(); - } -} - - SumBlockComponent::SumBlockComponent(const SumBlockComponent &other): input_dim_(other.input_dim_), output_dim_(other.output_dim_), scale_(other.scale_) { } diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index d7cece06284..2e0965d25e9 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -40,6 +40,9 @@ namespace nnet3 { /// output for one input, and return the kSimpleComponent flag in their /// Properties(): for example, tanh and affine components. In /// nnet-general-component.h there are components that don't fit this pattern. +/// +/// Some components that do provide the kSimpleComponent flag are not declared +/// here: see also nnet-normalize-component.h. // This "nnet3" version of the p-norm component only supports the 2-norm. class PnormComponent: public Component { @@ -186,82 +189,6 @@ class ElementwiseProductComponent: public Component { int32 output_dim_; }; -/* - Implements the function: - - y = x * (sqrt(dim(x)) * target-rms) / |x| - - where |x| is the 2-norm of the vector x. I.e. its output is its input - scaled such that the root-mean-square values of its elements equals - target-rms. (As a special case, if the input is zero, it outputs zero). - - Note: if you specify add-log-stddev=true, it adds an extra element to - y which equals log(|x| / sqrt(dim(x))). - - - Configuration values accepted: - dim, or input-dim Input dimension of this component, e.g. 1024. - Will be the same as the output dimension if add-log-stddev=false. - block-dim Defaults to 'dim' you may specify a nonzero divisor - of 'dim'. In this case the input dimension will - be interpreted as blocks of dimension 'block-dim' - to which the nonlinearity described above is applied - separately. - add-log-stddev You can set this to true to add an extra output - dimension which will equal |x| / sqrt(dim(x)). 
- If block-dim is specified, this is done per block. - target-rms This defaults to 1.0, but if set it to another - (nonzero) value, the output will be scaled by this - factor. - */ -class NormalizeComponent: public Component { - public: - explicit NormalizeComponent(const NormalizeComponent &other); - - virtual int32 Properties() const { - return kSimpleComponent|kBackpropNeedsInput|kBackpropAdds| - (add_log_stddev_ ? 0 : kPropagateInPlace|kBackpropInPlace) | - (block_dim_ != input_dim_ ? kInputContiguous|kOutputContiguous : 0); - } - NormalizeComponent() { } - virtual std::string Type() const { return "NormalizeComponent"; } - virtual void InitFromConfig(ConfigLine *cfl); - virtual Component* Copy() const { return new NormalizeComponent(*this); } - virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const; - virtual void Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, // out_value - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update, - CuMatrixBase *in_deriv) const; - - virtual void Read(std::istream &is, bool binary); - virtual void Write(std::ostream &os, bool binary) const; - virtual int32 InputDim() const { return input_dim_; } - virtual int32 OutputDim() const { - return (input_dim_ + (add_log_stddev_ ? (input_dim_ / block_dim_) : 0)); - } - virtual std::string Info() const; - private: - NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow. - enum { kExpSquaredNormFloor = -66 }; - // kSquaredNormFloor is about 0.7e-20. We need a value that's exactly representable in - // float and whose inverse square root is also exactly representable - // in float (hence, an even power of two). - static const BaseFloat kSquaredNormFloor; - int32 input_dim_; - int32 block_dim_; - BaseFloat target_rms_; // The target rms for outputs, default 1.0. 
- - bool add_log_stddev_; // If true, log(max(epsi, sqrt(row_in^T row_in / D))) - // is an extra dimension of the output. -}; - - /* Implements the sigmoid nonlinearity, i.e. the function y = exp(-x). @@ -2384,183 +2311,6 @@ class MaxpoolingComponent: public Component { }; -/* - BatchNormComponent - - This implements batch normalization; for each dimension of the - input it normalizes the data to be zero-mean, unit-variance. You - can set the block-dim configuration value to implement spatial - batch normalization, see the comment for the variable. - - If you want to combine this with the trainable offset and scale that the - original BatchNorm paper used, then follow this by the - ScaleAndOffsetComponent. - - It's a simple component (uses the kSimpleComponent flag), but it is unusual in - that it will give different results if you call it on half the matrix at a - time. Most of the time this would be pretty harmless, so we still return the - kSimpleComponent flag. We may have to modify the test code a little to - account for this, or possibly remove the kSimpleComponent flag. In some sense - each output Index depends on every input Index, but putting those dependencies - explicitly into the dependency-tracking framework as a GeneralComponent - would be very impractical and might lead to a lot of unnecessary things being - computed. You have to be a bit careful where you put this component, and understand - what you're doing e.g. putting it in the path of a recurrence is a bit problematic - if the minibatch size is small. - - Accepted configuration values: - dim Dimension of the input and output - block-dim Defaults to 'dim', but may be set to a nonzero divisor - of 'dim'. 
In this case, each block of dimension 'block-dim' - is treated like a separate row of the input matrix, which - means that the stats from n'th element of each - block are pooled into one class, for each n.a - epsilon Small term added to the variance that is used to prevent - division by zero - target-rms This defaults to 1.0, but if set, for instance, to 2.0, - it will normalize the standard deviation of the output to - 2.0. 'target-stddev' might be a more suitable name, but this - was chosen for consistency with NormalizeComponent. - */ -class BatchNormComponent: public Component { - public: - - BatchNormComponent() { } - - // call this with 'true' to set 'test mode' where the batch normalization is - // done with stored stats. There won't normally be any need to specially - // accumulate these stats; they are stored as a matter of course on each - // iteration of training, as for NonlinearComponents, and we'll use the stats - // from the most recent [script-level] iteration. - void SetTestMode(bool test_mode); - - // constructor using another component - BatchNormComponent(const BatchNormComponent &other); - - virtual int32 InputDim() const { return dim_; } - virtual int32 OutputDim() const { return dim_; } - - virtual std::string Info() const; - virtual void InitFromConfig(ConfigLine *cfl); - virtual std::string Type() const { return "BatchNormComponent"; } - virtual int32 Properties() const { - // If the block-dim is less than the dim, we need the input and output - // matrices to be contiguous (stride==num-cols), as we'll be reshaping - // internally. This is not much of a cost, because this will be used - // in convnets where we have to do this anyway. - return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace| - kBackpropInPlace| - (block_dim_ < dim_ ? kInputContiguous|kOutputContiguous : 0)| - (test_mode_ ? 
0 : kUsesMemo|kStoresStats); - } - virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const; - virtual void Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *, // to_update, - CuMatrixBase *in_deriv) const; - - virtual void Read(std::istream &is, bool binary); // This Read function - // requires that the Component has the correct type. - - /// Write component to stream - virtual void Write(std::ostream &os, bool binary) const; - virtual Component* Copy() const { return new BatchNormComponent(*this); } - - virtual void Scale(BaseFloat scale); - virtual void Add(BaseFloat alpha, const Component &other); - virtual void ZeroStats(); - - - virtual void DeleteMemo(void *memo) const { delete static_cast(memo); } - - virtual void StoreStats(const CuMatrixBase &in_value, - const CuMatrixBase &out_value, - void *memo); - - // Members specific to this component type. - // Note: the offset and scale will only be nonempty in 'test mode'. - const CuVector &Offset() const { return offset_; } - const CuVector &Scale() const { return scale_; } - - private: - - struct Memo { - // number of frames (after any reshaping). - int32 num_frames; - // 'sum_sumsq_scale' is of dimension 4 by block_dim_: - // Row 0 = mean = the mean of the rows of the input - // Row 1 = uvar = the uncentered variance of the input (= sumsq / num_frames). - // Row 2 = scale = the scale of the renormalization, which is - // Row 3 is used as a temporary in Backprop. - // the inverse stddev of the input (modified by epsilon_, - // see the Propagate function. - CuMatrix mean_uvar_scale; - }; - - void Check() const; - - // this function is used in a couple of places; it turns the raw stats into - // the offset/scale term of a normalizing transform. 
- static void ComputeOffsetAndScale(double count, - BaseFloat epsilon, - const Vector &stats_sum, - const Vector &stats_sumsq, - Vector *offset, - Vector *scale); - // computes derived parameters offset_ and scale_. - void ComputeDerived(); - - // Dimension of the input and output. - int32 dim_; - // This would normally be the same as dim_, but if it's less (and it must be > - // 0 and must divide dim_), then each separate block of the input of dimension - // 'block_dim_' is treated like a separate frame for the purposes of - // normalization. This can be used to implement spatial batch normalization - // for convolutional setups-- assuming the filter-dim has stride 1, which it - // always will in the new code in nnet-convolutional-component.h. - int32 block_dim_; - - // Used to avoid exact-zero variances, epsilon has the dimension of a - // covariance. - BaseFloat epsilon_; - - // This value will normally be 1.0, which is the default, but you can set it - // to other values as a way to control how fast the following layer learns - // (smaller -> slower). The same config exists in NormalizeComponent. - BaseFloat target_rms_; - - // This is true if we want the batch normalization to operate in 'test mode' - // meaning the data mean and stddev used for the normalization are fixed - // quantities based on previously accumulated stats. Note: the stats we use - // for this are based on the same 'StoreStats' mechanism as we use for - // components like SigmoidComponent and ReluComponent; we'll be using - // the stats from the most recent [script-level] iteration of training. - bool test_mode_; - - - // total count of stats stored by StoreStats(). - double count_; - // sum-of-data component of stats of input data. - CuVector stats_sum_; - // sum-of-squared component of stats of input data. - CuVector stats_sumsq_; - - // offset_ and scale_ are derived from stats_sum_ and stats_sumsq_; they - // dictate the transform that is done in 'test mode'. 
They are set only when - // reading the model from disk and when calling SetTestMode(true); they are - // resized to empty when the stats are updated, to ensure that out-of-date - // values are not kept around. - CuVector offset_; - CuVector scale_; -}; - - /** CompositeComponent is a component representing a sequence of [simple] components. The config line would be something like the following diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 83b902a9b90..6ed0b6f9191 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1711,6 +1711,16 @@ static void GenerateRandomComponentConfig(std::string *component_type, << " learning-rate=" << learning_rate; break; } + /* case 35: { + *component_type = "MemoryNormComponent"; + int32 block_dim = RandInt(1, 10), dim = block_dim * RandInt(1, 2); + os << " dim=" << dim + << " block-dim=" << block_dim << " target-rms=" + << RandInt(1, 2) << " include-indirect-derivative=" + << (RandInt(0, 1) == 0 ? "true" : "false") + << " epsilon=" << (RandInt(0, 1) == 0 ? "0.1" : "1.0"); + break; + }*/ default: KALDI_ERR << "Error generating random component"; } diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 64fc3003609..b000938d513 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -22,6 +22,7 @@ #include "nnet3/nnet-utils.h" #include "nnet3/nnet-graph.h" #include "nnet3/nnet-simple-component.h" +#include "nnet3/nnet-normalize-component.h" #include "nnet3/nnet-general-component.h" #include "nnet3/nnet-convolutional-component.h" #include "nnet3/nnet-parse.h" From 922fc902dd051ba101cb999a491366c8ec8b6cd0 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 10 Dec 2017 17:42:20 -0800 Subject: [PATCH 006/184] [src] Making MemoryNormComponent behave as BatchNormComponent in ScaleBatchnormStats, etc. 
--- src/nnet3/nnet-utils.cc | 11 ++++++++--- src/nnet3/nnet-utils.h | 9 +++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index b000938d513..7ae6bb99f09 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -492,9 +492,8 @@ void SetDropoutProportion(BaseFloat dropout_proportion, bool HasBatchnorm(const Nnet &nnet) { for (int32 c = 0; c < nnet.NumComponents(); c++) { const Component *comp = nnet.GetComponent(c); - const BatchNormComponent *bc = - dynamic_cast(comp); - if (bc != NULL) + if (dynamic_cast(comp) != NULL || + dynamic_cast(comp) != NULL) return true; } return false; @@ -510,6 +509,9 @@ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale, BatchNormComponent *bc = dynamic_cast(comp); if (bc != NULL) bc->Scale(batchnorm_stats_scale); + MemoryNormComponent *mc = dynamic_cast(comp); + if (mc != NULL) + mc->Scale(batchnorm_stats_scale); } } @@ -534,6 +536,9 @@ void SetBatchnormTestMode(bool test_mode, Nnet *nnet) { BatchNormComponent *bc = dynamic_cast(comp); if (bc != NULL) bc->SetTestMode(test_mode); + MemoryNormComponent *mc = dynamic_cast(comp); + if (mc != NULL) + mc->SetTestMode(test_mode); } } diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index d961b7cb6a0..b44b16b3606 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -169,10 +169,11 @@ void SetDropoutProportion(BaseFloat dropout_proportion, Nnet *nnet); /// Returns true if nnet has at least one component of type -/// BatchNormComponent. +/// BatchNormComponent or MemoryNormComponent bool HasBatchnorm(const Nnet &nnet); -/// This function affects only components of type BatchNormComponent. +/// This function affects only components of type BatchNormComponent or +/// MemoryNormComponent. /// It sets "test mode" on such components (if you call it with test_mode = /// true, otherwise it would set normal mode, but this wouldn't be needed /// often). 
"test mode" means that instead of using statistics from the batch, @@ -445,8 +446,8 @@ void ApplyL2Regularization(const Nnet &nnet, /** This function scales the batchorm stats of any batchnorm components - (components of type BatchNormComponent) in 'nnet' by the scale - 'batchnorm_stats_scale'. + (components of type BatchNormComponent or MemoryNormComponent) in 'nnet' by + the scale 'batchnorm_stats_scale'. */ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale, Nnet *nnet); From f400999a64143b67d88df0ba03f2b2b1d5a20772 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 10 Dec 2017 17:53:05 -0800 Subject: [PATCH 007/184] [src] Bug-fix in MemoryNormComponent --- src/nnet3/nnet-normalize-component.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index ac3817adfbe..ef6d7f4a6f3 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -1113,6 +1113,7 @@ void MemoryNormComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &backward_count_); ExpectToken(is, binary, ""); data_.Read(is, binary); + ExpectToken(is, binary, ""); Check(); } From 5bb3870c75093360e21f57de2e55c0d8acbc76a2 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 10 Dec 2017 19:43:29 -0800 Subject: [PATCH 008/184] [src] Fix bug in MemoryNormComponent --- src/nnet3/nnet-normalize-component.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index ef6d7f4a6f3..62a8afc6472 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -867,15 +867,20 @@ void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, if (test_mode_) { if (stats_count_ <= 0.0) KALDI_ERR << "Test mode set but no stats available."; - CuSubVector x_mean(data_, 3), scale(data_, 4); + CuSubVector x_mean(data_, 0), 
scale(data_, 4); out->AddVecToRows(-1.0, x_mean); out->MulColsVec(scale); return NULL; } else { Memo *memo = GetMemo(in); - CuSubVector x_sum(memo->data, 0), - scale(memo->data, 2); - out->AddVecToRows(-1.0 / memo->num_frames, x_sum); + if (stats_count_ <= 0.0) { + CuSubVector x_sum(memo->data, 0); + out->AddVecToRows(-1.0 / memo->num_frames, x_sum); + } else { // use the mean stored with this object. + CuSubVector x_mean(data_, 0); + out->AddVecToRows(-1.0, x_mean); + } + CuSubVector scale(memo->data, 2); out->MulColsVec(scale); return memo; } From 7e9cc29b0a9edb3c6e952c5997d61d3aafac1e2f Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 10 Dec 2017 19:58:22 -0800 Subject: [PATCH 009/184] [src] Some reorganizations of MemoryNormComponent code --- src/nnet3/nnet-normalize-component.cc | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index 62a8afc6472..7eca8594748 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -862,28 +862,26 @@ void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, if (out->Data() != in.Data()) out->CopyFromMat(in); + if (test_mode_ && stats_count_ <= 0.0) + KALDI_ERR << "Test mode set but no stats available."; + // From this point, we can assume that the num-cols of 'in' and 'out' // equals block_dim_. - if (test_mode_) { - if (stats_count_ <= 0.0) - KALDI_ERR << "Test mode set but no stats available."; + Memo *ans = NULL; + if (!test_mode_) + ans = GetMemo(in); + + if (test_mode_ || stats_count_ > 0.0) { CuSubVector x_mean(data_, 0), scale(data_, 4); out->AddVecToRows(-1.0, x_mean); out->MulColsVec(scale); - return NULL; } else { - Memo *memo = GetMemo(in); - if (stats_count_ <= 0.0) { - CuSubVector x_sum(memo->data, 0); - out->AddVecToRows(-1.0 / memo->num_frames, x_sum); - } else { // use the mean stored with this object. 
- CuSubVector x_mean(data_, 0); - out->AddVecToRows(-1.0, x_mean); - } - CuSubVector scale(memo->data, 2); + CuSubVector x_sum(memo->data, 0), + scale(memo->data, 2); + out->AddVecToRows(-1.0 / memo->num_frames, x_sum); out->MulColsVec(scale); - return memo; } + return memo; } From 39e6f777524b3270c1604c8134fd1d37362ca1f0 Mon Sep 17 00:00:00 2001 From: freewym Date: Mon, 11 Dec 2017 17:58:40 -0500 Subject: [PATCH 010/184] changes according to the review --- src/chainbin/nnet3-chain-combine.cc | 133 +++--- src/nnet3/nnet-chain-combine.cc | 610 ---------------------------- src/nnet3/nnet-chain-combine.h | 205 ---------- src/nnet3/nnet-combine.cc | 606 --------------------------- src/nnet3/nnet-combine.h | 251 ------------ src/nnet3bin/nnet3-combine.cc | 132 +++--- 6 files changed, 152 insertions(+), 1785 deletions(-) delete mode 100644 src/nnet3/nnet-chain-combine.cc delete mode 100644 src/nnet3/nnet-chain-combine.h delete mode 100644 src/nnet3/nnet-combine.cc delete mode 100644 src/nnet3/nnet-combine.h diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index 7dece5cb070..520575e1d88 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -28,44 +28,54 @@ namespace kaldi { namespace nnet3 { -double ComputeObjf(const std::vector &egs, +// Computes the objective of the moving average of nnet on egs. If either of +// batchnorm/dropout test modes is true, we make a copy of the moving average, +// set test modes on that and evaluate its objective. Note: the object that +// prob_computer->nnet_ refers to should be moving_average_nnet. 
+double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, + const std::vector &egs, + const Nnet &moving_average_nnet, + const chain::ChainTrainingOptions &chain_config, + const fst::StdVectorFst &den_fst, NnetChainComputeProb *prob_computer) { - prob_computer->Reset(); - std::vector::const_iterator iter = egs.begin(), - end = egs.end(); - for (; iter != end; ++iter) - prob_computer->Compute(*iter); - const ChainObjectiveInfo *objf_info = - prob_computer->GetObjective("output"); - if (objf_info == NULL) - KALDI_ERR << "Error getting objective info (unsuitable egs?)"; - KALDI_ASSERT(objf_info->tot_weight > 0.0); - // we prefer to deal with normalized objective functions. - return (objf_info->tot_like + objf_info->tot_l2_term) / objf_info->tot_weight; + if (batchnorm_test_mode || dropout_test_mode) { + Nnet moving_average_nnet_copy(moving_average_nnet); + if (batchnorm_test_mode) + SetBatchnormTestMode(true, &moving_average_nnet_copy); + if (dropout_test_mode) + SetDropoutTestMode(true, &moving_average_nnet_copy); + NnetComputeProbOptions compute_prob_opts; + NnetChainComputeProb prob_computer_test(compute_prob_opts, chain_config, + den_fst, moving_average_nnet_copy); + return ComputeObjf(false, false, egs, moving_average_nnet_copy, + chain_config, den_fst, &prob_computer_test); + } else { + prob_computer->Reset(); + std::vector::const_iterator iter = egs.begin(), + end = egs.end(); + for (; iter != end; ++iter) + prob_computer->Compute(*iter); + const ChainObjectiveInfo *objf_info = + prob_computer->GetObjective("output"); + if (objf_info == NULL) + KALDI_ERR << "Error getting objective info (unsuitable egs?)"; + KALDI_ASSERT(objf_info->tot_weight > 0.0); + // inf/nan tot_objf->return -inf objective. + double tot_objf = objf_info->tot_like + objf_info->tot_l2_term; + if (!(tot_objf == tot_objf && tot_objf - tot_objf == 0)) + return -std::numeric_limits::infinity(); + // we prefer to deal with normalized objective functions. 
+ return tot_objf / objf_info->tot_weight; + } } -// Note: the object that prob_computer.nnet_ refers to should be -// *moving_average_nnet. -double UpdateNnetMovingAverageAndComputeObjf(int32 num_models, - const std::vector &egs, - const Nnet &nnet, Nnet *moving_average_nnet, - NnetChainComputeProb *prob_computer) { - int32 num_params = NumParameters(nnet); - KALDI_ASSERT(num_params == NumParameters(*moving_average_nnet)); - Vector nnet_params(num_params, kUndefined), - moving_average_nnet_params(num_params, kUndefined); - VectorizeNnet(nnet, &nnet_params); - VectorizeNnet(*moving_average_nnet, &moving_average_nnet_params); - moving_average_nnet_params.Scale((num_models - 1.0) / num_models); - moving_average_nnet_params.AddVec(1.0 / num_models, nnet_params); - - BaseFloat sum = moving_average_nnet_params.Sum(); - // inf/nan parameters->return -inf objective. - if (!(sum == sum && sum - sum == 0)) - return -std::numeric_limits::infinity(); - - UnVectorizeNnet(moving_average_nnet_params, moving_average_nnet); - return ComputeObjf(egs, prob_computer); +// Updates moving average over num_models nnets, given the average over +// previous (num_models - 1) nnets, and the new nnet. 
+void UpdateNnetMovingAverage(int32 num_models, + const Nnet &nnet, Nnet *moving_average_nnet) { + KALDI_ASSERT(NumParameters(nnet) == NumParameters(*moving_average_nnet)); + ScaleNnet((num_models - 1.0) / num_models, moving_average_nnet); + AddNnet(nnet, 1.0 / num_models, moving_average_nnet); } } @@ -93,6 +103,7 @@ int main(int argc, char *argv[]) { " nnet3-combine den.fst 35.raw 36.raw 37.raw 38.raw ark:valid.cegs final.raw\n"; bool binary_write = true; + int32 max_objective_evaluations = 30; bool batchnorm_test_mode = false, dropout_test_mode = true; std::string use_gpu = "yes"; @@ -100,13 +111,19 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("max-objective-evaluations", &max_objective_evaluations, "Max " + "number of objective evaluations in order to figure out the " + "best number of models to combine. It helps to speedup if " + "the number of models provided to this binary is quite large " + "(e.g. 
several hundred)."); po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); po.Register("batchnorm-test-mode", &batchnorm_test_mode, - "If true, set test-mode to true on any BatchNormComponents."); + "If true, set test-mode to true on any BatchNormComponents " + "while evaluating objectives."); po.Register("dropout-test-mode", &dropout_test_mode, "If true, set test-mode to true on any DropoutComponents and " - "DropoutMaskComponents."); + "DropoutMaskComponents while evaluating objectives."); chain_config.Register(&po); @@ -135,13 +152,8 @@ int main(int argc, char *argv[]) { ReadKaldiObject(raw_nnet_rxfilename, &nnet); Nnet moving_average_nnet(nnet), best_nnet(nnet); NnetComputeProbOptions compute_prob_opts; - NnetChainComputeProb *prob_computer = new NnetChainComputeProb( - compute_prob_opts, chain_config, den_fst, moving_average_nnet); - - if (batchnorm_test_mode) - SetBatchnormTestMode(true, &nnet); - if (dropout_test_mode) - SetDropoutTestMode(true, &nnet); + NnetChainComputeProb prob_computer(compute_prob_opts, chain_config, + den_fst, moving_average_nnet); std::vector egs; egs.reserve(10000); // reserve a lot of space to minimize the chance of @@ -156,26 +168,35 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(!egs.empty()); } - int32 best_n = 1; - double best_objf = ComputeObjf(egs, prob_computer); + // first evaluates the objective using the last model. + int32 best_num_to_combine = 1; + double best_objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, + egs, moving_average_nnet, chain_config, den_fst, &prob_computer); KALDI_LOG << "objective function using the last model is " << best_objf; int32 num_nnets = po.NumArgs() - 3; - + // then each time before we re-evaluate the objective function, we will add + // num_to_add models to the moving average. 
+ int32 num_to_add = (num_nnets + max_objective_evaluations - 1) / + max_objective_evaluations; for (int32 n = 1; n < num_nnets; n++) { std::string this_nnet_rxfilename = po.GetArg(n + 2); ReadKaldiObject(this_nnet_rxfilename, &nnet); - double objf = UpdateNnetMovingAverageAndComputeObjf(n + 1, egs, nnet, - &moving_average_nnet, prob_computer); - KALDI_LOG << "Combining last " << n + 1 - << " models, objective function is " << objf; - if (objf > best_objf) { - best_objf = objf; - best_nnet = moving_average_nnet; - best_n = n + 1; + // updates the moving average + UpdateNnetMovingAverage(n + 1, nnet, &moving_average_nnet); + if ((n - 1) % num_to_add == num_to_add - 1 || n == num_nnets - 1) { + double objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, + egs, moving_average_nnet, chain_config, den_fst, &prob_computer); + KALDI_LOG << "Combining last " << n + 1 + << " models, objective function is " << objf; + if (objf > best_objf) { + best_objf = objf; + best_nnet = moving_average_nnet; + best_num_to_combine = n + 1; + } } } - KALDI_LOG << "Using the model averaged over last " << best_n + KALDI_LOG << "Using the model averaged over last " << best_num_to_combine << " models, objective function is " << best_objf; if (HasBatchnorm(nnet)) diff --git a/src/nnet3/nnet-chain-combine.cc b/src/nnet3/nnet-chain-combine.cc deleted file mode 100644 index c93858fb06e..00000000000 --- a/src/nnet3/nnet-chain-combine.cc +++ /dev/null @@ -1,610 +0,0 @@ -// nnet3/nnet-chain-combine.cc - -// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "nnet3/nnet-chain-combine.h" -#include "nnet3/nnet-utils.h" - -namespace kaldi { -namespace nnet3 { - -NnetChainCombiner::NnetChainCombiner(const NnetCombineConfig &combine_config, - const chain::ChainTrainingOptions &chain_config, - int32 num_nnets, - const std::vector &egs, - const fst::StdVectorFst &den_fst, - const Nnet &first_nnet): - combine_config_(combine_config), - chain_config_(chain_config), - egs_(egs), - den_fst_(den_fst), - nnet_(first_nnet), - num_real_input_nnets_(num_nnets), - nnet_params_(std::min(num_nnets, combine_config_.max_effective_inputs), - NumParameters(first_nnet)), - tot_input_weighting_(nnet_params_.NumRows()) { - - if (combine_config_.sum_to_one_penalty != 0.0 && - combine_config_.enforce_sum_to_one) { - KALDI_WARN << "--sum-to-one-penalty=" << combine_config_.sum_to_one_penalty - << " is nonzero, so setting --enforce-sum-to-one=false."; - combine_config_.enforce_sum_to_one = false; - } - SubVector first_params(nnet_params_, 0); - VectorizeNnet(nnet_, &first_params); - tot_input_weighting_(0) += 1.0; - num_nnets_provided_ = 1; - ComputeUpdatableComponentDims(); - NnetComputeProbOptions compute_prob_opts; - compute_prob_opts.compute_deriv = true; - prob_computer_ = new NnetChainComputeProb(compute_prob_opts, chain_config_, den_fst_, nnet_); -} - -void NnetChainCombiner::ComputeUpdatableComponentDims(){ - updatable_component_dims_.clear(); - for (int32 c = 0; c < nnet_.NumComponents(); c++) { - Component *comp = nnet_.GetComponent(c); - if 
(comp->Properties() & kUpdatableComponent) { - // For now all updatable components inherit from class UpdatableComponent. - // If that changes in future, we will change this code. - UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - "UpdatableComponent; change this code."; - updatable_component_dims_.push_back(uc->NumParameters()); - } - } -} - -void NnetChainCombiner::AcceptNnet(const Nnet &nnet) { - KALDI_ASSERT(num_nnets_provided_ < num_real_input_nnets_ && - "You called AcceptNnet too many times."); - int32 num_effective_nnets = nnet_params_.NumRows(); - if (num_effective_nnets == num_real_input_nnets_) { - SubVector this_params(nnet_params_, num_nnets_provided_); - VectorizeNnet(nnet, &this_params); - tot_input_weighting_(num_nnets_provided_) += 1.0; - } else { - // this_index is a kind of warped index, mapping the range - // 0 ... num_real_inputs_nnets_ - 1 onto the range - // 0 ... num_effective_nnets - 1. View the index as falling in - // between two integer indexes and determining weighting factors. - // we could view this as triangular bins. 
- BaseFloat this_index = num_nnets_provided_ * (num_effective_nnets - 1) - / static_cast(num_real_input_nnets_ - 1); - int32 lower_index = std::floor(this_index), - upper_index = lower_index + 1; - BaseFloat remaining_part = this_index - lower_index, - lower_weight = 1.0 - remaining_part, - upper_weight = remaining_part; - KALDI_ASSERT(lower_index >= 0 && upper_index <= num_effective_nnets && - lower_weight >= 0.0 && upper_weight >= 0.0 && - lower_weight <= 1.0 && upper_weight <= 1.0); - Vector vec(nnet_params_.NumCols(), kUndefined); - VectorizeNnet(nnet, &vec); - nnet_params_.Row(lower_index).AddVec(lower_weight, vec); - tot_input_weighting_(lower_index) += lower_weight; - if (upper_index == num_effective_nnets) { - KALDI_ASSERT(upper_weight < 0.1); - } else { - nnet_params_.Row(upper_index).AddVec(upper_weight, vec); - tot_input_weighting_(upper_index) += upper_weight; - } - } - num_nnets_provided_++; -} - -void NnetChainCombiner::FinishPreprocessingInput() { - KALDI_ASSERT(num_nnets_provided_ == num_real_input_nnets_ && - "You did not call AcceptInput() enough times."); - int32 num_effective_nnets = nnet_params_.NumRows(); - for (int32 i = 0; i < num_effective_nnets; i++) { - BaseFloat tot_weight = tot_input_weighting_(i); - KALDI_ASSERT(tot_weight > 0.0); // Or would be a coding error. - // Rescale so this row is like a weighted average instead of - // a weighted sum. - if (tot_weight != 1.0) - nnet_params_.Row(i).Scale(1.0 / tot_weight); - } -} - -void NnetChainCombiner::Combine() { - FinishPreprocessingInput(); - - if (!SelfTestDerivatives()) { - KALDI_LOG << "Self-testing model derivatives since parameter-derivatives " - "self-test failed."; - SelfTestModelDerivatives(); - } - - int32 dim = ParameterDim(); - LbfgsOptions lbfgs_options; - lbfgs_options.minimize = false; // We're maximizing. - lbfgs_options.m = dim; // Store the same number of vectors as the dimension - // itself, so this is BFGS. 
- lbfgs_options.first_step_impr = combine_config_.initial_impr; - - Vector params(dim), deriv(dim); - double objf, initial_objf; - GetInitialParameters(¶ms); - - - OptimizeLbfgs lbfgs(params, lbfgs_options); - - for (int32 i = 0; i < combine_config_.num_iters; i++) { - params.CopyFromVec(lbfgs.GetProposedValue()); - objf = ComputeObjfAndDerivFromParameters(params, &deriv); - KALDI_VLOG(2) << "Iteration " << i << " params = " << params - << ", objf = " << objf << ", deriv = " << deriv; - if (i == 0) initial_objf = objf; - lbfgs.DoStep(objf, deriv); - } - - if (!combine_config_.sum_to_one_penalty) { - KALDI_LOG << "Combining nnets, objective function changed from " - << initial_objf << " to " << objf; - } else { - Vector weights(WeightDim()); - GetWeights(params, &weights); - bool print_weights = true; - double penalty = GetSumToOnePenalty(weights, NULL, print_weights); - // note: initial_objf has no penalty term because it summed exactly - // to one. - KALDI_LOG << "Combining nnets, objective function changed from " - << initial_objf << " to " << objf << " = " - << (objf - penalty) << " + " << penalty; - } - - - // must recompute nnet_ if "params" is not exactly equal to the - // final params that LB - Vector final_params(dim); - final_params.CopyFromVec(lbfgs.GetValue(&objf)); - if (!params.ApproxEqual(final_params, 0.0)) { - // the following call makes sure that nnet_ corresponds to the parameters - // in "params". 
- ComputeObjfAndDerivFromParameters(final_params, &deriv); - } - PrintParams(final_params); -} - - -void NnetChainCombiner::PrintParams(const VectorBase ¶ms) const { - Vector weights(WeightDim()), normalized_weights(WeightDim()); - GetWeights(params, &weights); - GetNormalizedWeights(weights, &normalized_weights); - int32 num_models = nnet_params_.NumRows(), - num_uc = NumUpdatableComponents(); - - if (combine_config_.separate_weights_per_component) { - std::vector updatable_component_names; - for (int32 c = 0; c < nnet_.NumComponents(); c++) { - const Component *comp = nnet_.GetComponent(c); - if (comp->Properties() & kUpdatableComponent) - updatable_component_names.push_back(nnet_.GetComponentName(c)); - } - KALDI_ASSERT(static_cast(updatable_component_names.size()) == - NumUpdatableComponents()); - for (int32 uc = 0; uc < num_uc; uc++) { - std::ostringstream os; - os.width(20); - os << std::left << updatable_component_names[uc] << ": "; - os.width(9); - os.precision(4); - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + uc; - os << " " << std::left << normalized_weights(index); - } - KALDI_LOG << "Weights for " << os.str(); - } - } else { - int32 c = 0; // arbitrarily chosen; they'll all be the same. - std::ostringstream os; - os.width(9); - os.precision(4); - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - os << " " << std::left << normalized_weights(index); - } - KALDI_LOG << "Model weights are: " << os.str(); - } - int32 num_effective_nnets = nnet_params_.NumRows(); - if (num_effective_nnets != num_real_input_nnets_) - KALDI_LOG << "Above, only " << num_effective_nnets << " weights were " - "printed due to the the --num-effective-nnets option; " - "there were " << num_real_input_nnets_ << " actual input nnets. 
" - "Each weight corresponds to a weighted average over a range of " - "nnets in the sequence (with triangular bins)"; -} - -bool NnetChainCombiner::SelfTestDerivatives() { - int32 num_tests = 2; // more properly, this is the number of dimensions in a - // single test. - double delta = 0.001; - int32 dim = ParameterDim(); - - Vector params(dim), deriv(dim); - Vector predicted_changes(num_tests), - observed_changes(num_tests); - - GetInitialParameters(¶ms); - double initial_objf = ComputeObjfAndDerivFromParameters(params, - &deriv); - for (int32 i = 0; i < num_tests; i++) { - Vector new_deriv(dim), offset(dim), new_params(params); - offset.SetRandn(); - new_params.AddVec(delta, offset); - double new_objf = ComputeObjfAndDerivFromParameters(new_params, - &new_deriv); - // for predicted changes, interpolate old and new derivs. - predicted_changes(i) = - 0.5 * VecVec(new_params, deriv) - 0.5 * VecVec(params, deriv) + - 0.5 * VecVec(new_params, new_deriv) - 0.5 * VecVec(params, new_deriv); - observed_changes(i) = new_objf - initial_objf; - } - double threshold = 0.1; - KALDI_LOG << "predicted_changes = " << predicted_changes; - KALDI_LOG << "observed_changes = " << observed_changes; - if (!ApproxEqual(predicted_changes, observed_changes, threshold)) { - KALDI_WARN << "Derivatives self-test failed."; - return false; - } else { - return true; - } -} - - -void NnetChainCombiner::SelfTestModelDerivatives() { - int32 num_tests = 3; // more properly, this is the number of dimensions in a - // single test. 
- int32 dim = ParameterDim(); - - Vector params(dim), deriv(dim); - Vector predicted_changes(num_tests), - observed_changes(num_tests); - - GetInitialParameters(¶ms); - Vector weights(WeightDim()), normalized_weights(WeightDim()); - Vector nnet_params(NnetParameterDim(), kUndefined), - nnet_deriv(NnetParameterDim(), kUndefined); - GetWeights(params, &weights); - GetNormalizedWeights(weights, &normalized_weights); - GetNnetParameters(normalized_weights, &nnet_params); - - double initial_objf = ComputeObjfAndDerivFromNnet(nnet_params, - &nnet_deriv); - - double delta = 0.002 * std::sqrt(VecVec(nnet_params, nnet_params) / - NnetParameterDim()); - - - for (int32 i = 0; i < num_tests; i++) { - Vector new_nnet_deriv(NnetParameterDim()), - offset(NnetParameterDim()), new_nnet_params(nnet_params); - offset.SetRandn(); - new_nnet_params.AddVec(delta, offset); - double new_objf = ComputeObjfAndDerivFromNnet(new_nnet_params, - &new_nnet_deriv); - // for predicted changes, interpolate old and new derivs. 
- predicted_changes(i) = - 0.5 * VecVec(new_nnet_params, nnet_deriv) - - 0.5 * VecVec(nnet_params, nnet_deriv) + - 0.5 * VecVec(new_nnet_params, new_nnet_deriv) - - 0.5 * VecVec(nnet_params, new_nnet_deriv); - observed_changes(i) = new_objf - initial_objf; - } - double threshold = 0.1; - KALDI_LOG << "model-derivatives: predicted_changes = " << predicted_changes; - KALDI_LOG << "model-derivatives: observed_changes = " << observed_changes; - if (!ApproxEqual(predicted_changes, observed_changes, threshold)) - KALDI_WARN << "Model derivatives self-test failed."; -} - - - - -int32 NnetChainCombiner::ParameterDim() const { - if (combine_config_.separate_weights_per_component) - return NumUpdatableComponents() * nnet_params_.NumRows(); - else - return nnet_params_.NumRows(); -} - - -void NnetChainCombiner::GetInitialParameters(VectorBase *params) const { - KALDI_ASSERT(params->Dim() == ParameterDim()); - params->Set(1.0 / nnet_params_.NumRows()); - if (combine_config_.enforce_positive_weights) { - // we enforce positive weights by treating the params as the log of the - // actual weight. - params->ApplyLog(); - } -} - -void NnetChainCombiner::GetWeights(const VectorBase ¶ms, - VectorBase *weights) const { - KALDI_ASSERT(weights->Dim() == WeightDim()); - if (combine_config_.separate_weights_per_component) { - weights->CopyFromVec(params); - } else { - int32 nc = NumUpdatableComponents(); - // have one parameter per row of nnet_params_, and need to repeat - // the weight for the different components. - for (int32 n = 0; n < nnet_params_.NumRows(); n++) { - for (int32 c = 0; c < nc; c++) - (*weights)(n * nc + c) = params(n); - } - } - // we enforce positive weights by having the weights be the exponential of the - // corresponding parameters. 
- if (combine_config_.enforce_positive_weights) - weights->ApplyExp(); -} - - -void NnetChainCombiner::GetParamsDeriv(const VectorBase &weights, - const VectorBase &weights_deriv, - VectorBase *param_deriv) { - KALDI_ASSERT(weights.Dim() == WeightDim() && - param_deriv->Dim() == ParameterDim()); - Vector preexp_weights_deriv(weights_deriv); - if (combine_config_.enforce_positive_weights) { - // to enforce positive weights we first compute weights (call these - // preexp_weights) and then take exponential. Note, d/dx exp(x) = exp(x). - // So the derivative w.r.t. the preexp_weights equals the derivative - // w.r.t. the weights, times the weights. - preexp_weights_deriv.MulElements(weights); - } - if (combine_config_.separate_weights_per_component) { - param_deriv->CopyFromVec(preexp_weights_deriv); - } else { - int32 nc = NumUpdatableComponents(); - param_deriv->SetZero(); - for (int32 n = 0; n < nnet_params_.NumRows(); n++) - for (int32 c = 0; c < nc; c++) - (*param_deriv)(n) += preexp_weights_deriv(n * nc + c); - } -} - -double NnetChainCombiner::GetSumToOnePenalty( - const VectorBase &weights, - VectorBase *weights_penalty_deriv, - bool print_weights) const { - - KALDI_ASSERT(combine_config_.sum_to_one_penalty >= 0.0); - double penalty = combine_config_.sum_to_one_penalty; - if (penalty == 0.0) { - weights_penalty_deriv->SetZero(); - return 0.0; - } - double ans = 0.0; - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - Vector tot_weights(num_uc); - std::ostringstream tot_weight_info; - for (int32 c = 0; c < num_uc; c++) { - double this_total_weight = 0.0; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - double this_weight = weights(index); - this_total_weight += this_weight; - } - tot_weights(c) = this_total_weight; - // this_total_weight_deriv is the derivative of the penalty - // term w.r.t. this component's total weight. 
- double this_total_weight_deriv; - if (combine_config_.enforce_positive_weights) { - // if combine_config_.enforce_positive_weights is true, then we choose to - // formulate the penalty in a slightly different way.. this solves the - // problem that with the formulation in the 'else' below, if for some - // reason the total weight is << 1.0, the deriv w.r.t. the actual - // parameters gets tiny [because weight = exp(params)]. - double log_total = log(this_total_weight); - ans += -0.5 * penalty * log_total * log_total; - double log_total_deriv = -1.0 * penalty * log_total; - this_total_weight_deriv = log_total_deriv / this_total_weight; - } else { - ans += -0.5 * penalty * - (this_total_weight - 1.0) * (this_total_weight - 1.0); - this_total_weight_deriv = penalty * (1.0 - this_total_weight); - - } - if (weights_penalty_deriv != NULL) { - KALDI_ASSERT(weights.Dim() == weights_penalty_deriv->Dim()); - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - (*weights_penalty_deriv)(index) = this_total_weight_deriv; - } - } - } - if (print_weights) { - Vector tot_weights_float(tot_weights); - KALDI_LOG << "Total weights per component: " - << PrintVectorPerUpdatableComponent(nnet_, - tot_weights_float); - } - return ans; -} - -void NnetChainCombiner::GetNnetParameters(const Vector &weights, - VectorBase *nnet_params) const { - KALDI_ASSERT(nnet_params->Dim() == nnet_params_.NumCols()); - nnet_params->SetZero(); - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - for (int32 m = 0; m < num_models; m++) { - const SubVector src_params(nnet_params_, m); - int32 dim_offset = 0; - for (int32 c = 0; c < num_uc; c++) { - int32 index = m * num_uc + c; - BaseFloat weight = weights(index); - int32 dim = updatable_component_dims_[c]; - const SubVector src_component_params(src_params, dim_offset, - dim); - SubVector dest_component_params(*nnet_params, dim_offset, dim); - dest_component_params.AddVec(weight, src_component_params); - 
dim_offset += dim; - } - KALDI_ASSERT(dim_offset == nnet_params_.NumCols()); - } -} - -// compare GetNnetParameters. -void NnetChainCombiner::GetWeightsDeriv( - const VectorBase &nnet_params_deriv, - VectorBase *weights_deriv) { - KALDI_ASSERT(nnet_params_deriv.Dim() == nnet_params_.NumCols() && - weights_deriv->Dim() == WeightDim()); - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - for (int32 m = 0; m < num_models; m++) { - const SubVector src_params(nnet_params_, m); - int32 dim_offset = 0; - for (int32 c = 0; c < num_uc; c++) { - int32 index = m * num_uc + c; - int32 dim = updatable_component_dims_[c]; - const SubVector src_component_params(src_params, dim_offset, - dim); - const SubVector component_params_deriv(nnet_params_deriv, - dim_offset, dim); - (*weights_deriv)(index) = VecVec(src_component_params, - component_params_deriv); - dim_offset += dim; - } - KALDI_ASSERT(dim_offset == nnet_params_.NumCols()); - } -} - -double NnetChainCombiner::ComputeObjfAndDerivFromNnet( - VectorBase &nnet_params, - VectorBase *nnet_params_deriv) { - BaseFloat sum = nnet_params.Sum(); - // inf/nan parameters->return -inf objective. - if (!(sum == sum && sum - sum == 0)) - return -std::numeric_limits::infinity(); - // Set nnet to have these params. - UnVectorizeNnet(nnet_params, &nnet_); - - prob_computer_->Reset(); - std::vector::const_iterator iter = egs_.begin(), - end = egs_.end(); - for (; iter != end; ++iter) - prob_computer_->Compute(*iter); - const ChainObjectiveInfo *objf_info = - prob_computer_->GetObjective("output"); - if (objf_info == NULL) - KALDI_ERR << "Error getting objective info (unsuitable egs?)"; - KALDI_ASSERT(objf_info->tot_weight > 0.0); - const Nnet &deriv = prob_computer_->GetDeriv(); - VectorizeNnet(deriv, nnet_params_deriv); - // we prefer to deal with normalized objective functions. 
- nnet_params_deriv->Scale(1.0 / objf_info->tot_weight); - return (objf_info->tot_like + objf_info->tot_l2_term) / objf_info->tot_weight; -} - - -double NnetChainCombiner::ComputeObjfAndDerivFromParameters( - VectorBase ¶ms, - VectorBase *params_deriv) { - Vector weights(WeightDim()), normalized_weights(WeightDim()), - weights_sum_to_one_penalty_deriv(WeightDim()), - normalized_weights_deriv(WeightDim()), weights_deriv(WeightDim()); - Vector - nnet_params(NnetParameterDim(), kUndefined), - nnet_params_deriv(NnetParameterDim(), kUndefined); - GetWeights(params, &weights); - double ans = GetSumToOnePenalty(weights, &weights_sum_to_one_penalty_deriv); - GetNormalizedWeights(weights, &normalized_weights); - GetNnetParameters(normalized_weights, &nnet_params); - ans += ComputeObjfAndDerivFromNnet(nnet_params, &nnet_params_deriv); - if (ans != ans || ans - ans != 0) // NaN or inf - return ans; // No point computing derivative - GetWeightsDeriv(nnet_params_deriv, &normalized_weights_deriv); - GetUnnormalizedWeightsDeriv(weights, normalized_weights_deriv, - &weights_deriv); - weights_deriv.AddVec(1.0, weights_sum_to_one_penalty_deriv); - GetParamsDeriv(weights, weights_deriv, params_deriv); - return ans; -} - - -// enforces the constraint that the weights for each component must sum to one, -// if necessary. -void NnetChainCombiner::GetNormalizedWeights( - const VectorBase &unnorm_weights, - VectorBase *norm_weights) const { - if (!combine_config_.enforce_sum_to_one) { - norm_weights->CopyFromVec(unnorm_weights); - return; - } - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - for (int32 c = 0; c < num_uc; c++) { - double sum = 0.0; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - sum += unnorm_weights(index); - } - double inv_sum = 1.0 / sum; // if it's NaN then it's OK, we'll get NaN - // weights and eventually -inf objective. 
- for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - (*norm_weights)(index) = unnorm_weights(index) * inv_sum; - } - } -} - -void NnetChainCombiner::GetUnnormalizedWeightsDeriv( - const VectorBase &unnorm_weights, - const VectorBase &norm_weights_deriv, - VectorBase *unnorm_weights_deriv) { - if (!combine_config_.enforce_sum_to_one) { - unnorm_weights_deriv->CopyFromVec(norm_weights_deriv); - return; - } - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - for (int32 c = 0; c < num_uc; c++) { - double sum = 0.0; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - sum += unnorm_weights(index); - } - double inv_sum = 1.0 / sum; - double inv_sum_deriv = 0.0; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - // in the forward direction, we'd do: - // (*norm_weights)(index) = unnorm_weights(index) * inv_sum; - (*unnorm_weights_deriv)(index) = inv_sum * norm_weights_deriv(index); - inv_sum_deriv += norm_weights_deriv(index) * unnorm_weights(index); - } - // note: d/dx (1/x) = -1/x^2 - double sum_deriv = -1.0 * inv_sum_deriv * inv_sum * inv_sum; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - (*unnorm_weights_deriv)(index) += sum_deriv; - } - } -} - - - - -} // namespace nnet3 -} // namespace kaldi diff --git a/src/nnet3/nnet-chain-combine.h b/src/nnet3/nnet-chain-combine.h deleted file mode 100644 index 3aeb3882650..00000000000 --- a/src/nnet3/nnet-chain-combine.h +++ /dev/null @@ -1,205 +0,0 @@ -// nnet3/nnet-chain-combine.h - -// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_NNET3_NNET_CHAIN_COMBINE_H_ -#define KALDI_NNET3_NNET_CHAIN_COMBINE_H_ - -#include "nnet3/nnet-utils.h" -#include "nnet3/nnet-compute.h" -#include "util/parse-options.h" -#include "itf/options-itf.h" -#include "nnet3/nnet-combine.h" -#include "nnet3/nnet-chain-diagnostics.h" - - -namespace kaldi { -namespace nnet3 { - -// we re-use NnetCombineConfig from nnet-combine.h - -/* - You should use this class as follows: - - Call the constructor, giving it the egs and the first nnet. - - Call AcceptNnet to provide all the other nnets. (the nnets will - be stored in a matrix in CPU memory, to avoid filing up GPU memory). - - Call Combine() - - Get the resultant nnet with GetNnet(). - */ -class NnetChainCombiner { - public: - /// Caution: this object retains a const reference to the "egs", so don't - /// delete them until it goes out of scope. - NnetChainCombiner(const NnetCombineConfig &nnet_config, - const chain::ChainTrainingOptions &chain_config, - int32 num_nnets, - const std::vector &egs, - const fst::StdVectorFst &den_fst, - const Nnet &first_nnet); - - /// You should call this function num_nnets-1 times after calling - /// the constructor, to provide the remaining nnets. 
- void AcceptNnet(const Nnet &nnet); - - void Combine(); - - const Nnet &GetNnet() const { return nnet_; } - - ~NnetChainCombiner() { delete prob_computer_; } - private: - NnetCombineConfig combine_config_; - const chain::ChainTrainingOptions &chain_config_; - - const std::vector &egs_; - - const fst::StdVectorFst &den_fst_; - - Nnet nnet_; // The current neural network. - - NnetChainComputeProb *prob_computer_; - - std::vector updatable_component_dims_; // dimension of each updatable - // component. - - int32 num_real_input_nnets_; // number of actual nnet inputs. - - int32 num_nnets_provided_; // keeps track of the number of calls to AcceptNnet(). - - // nnet_params_ are the parameters of the "effective input" - // neural nets; they will often be the same as the real inputs, - // but if num_real_input_nnets_ > config_.num_effective_nnets, they - // will be weighted combinations. - Matrix nnet_params_; - - // This vector has the same dimension as nnet_params_.NumRows(), - // and helps us normalize so each row of nnet_params corresponds to - // a weighted average of its inputs (will be all ones if - // config_.max_effective_inputs >= the number of nnets provided). - Vector tot_input_weighting_; - - // returns the parameter dimension, i.e. the dimension of the parameters that - // we are optimizing. This depends on the config, the number of updatable - // components and nnet_params_.NumRows(); it will never exceed the number of - // updatable components times nnet_params_.NumRows(). - int32 ParameterDim() const; - - int32 NumUpdatableComponents() const { - return updatable_component_dims_.size(); - } - // returns the weight dimension. - int32 WeightDim() const { - return nnet_params_.NumRows() * NumUpdatableComponents(); - } - - int32 NnetParameterDim() const { return nnet_params_.NumCols(); } - - // Computes the initial parameters. The parameters are the underlying thing - // that we optimize; their dimension equals ParameterDim(). 
They are not the same - // thing as the nnet parameters. - void GetInitialParameters(VectorBase *params) const; - - // Tests that derivatives are accurate. Prints warning and returns false if not. - bool SelfTestDerivatives(); - - // Tests that model derivatives are accurate. Just prints warning if not. - void SelfTestModelDerivatives(); - - - // prints the parameters via logging statements. - void PrintParams(const VectorBase ¶ms) const; - - // This function computes the objective function (and its derivative, if the objective - // function is finite) at the given value of the parameters (the parameters we're optimizing, - // i.e. the combination weights; not the nnet parameters. This function calls most of the - // functions below. - double ComputeObjfAndDerivFromParameters( - VectorBase ¶ms, - VectorBase *params_deriv); - - - // Computes the weights from the parameters in a config-dependent way. The - // weight dimension is always (the number of updatable components times - // nnet_params_.NumRows()). - void GetWeights(const VectorBase ¶ms, - VectorBase *weights) const; - - // Given the raw weights: if config_.enforce_sum_to_one, then compute weights - // with sum-to-one constrint per component included; else just copy input to - // output. - void GetNormalizedWeights(const VectorBase &unnorm_weights, - VectorBase *norm_weights) const; - - // if config_.sum_to_one_penalty is 0.0, returns 0.0 and sets - // weights_penalty_deriv to 0.0; else it computes, for each - // updatable component u the total weight w_u, returns the value - // -0.5 * config_.sum_to_one_penalty * sum_u (w_u - 1.0)^2; - // and sets 'weights_penalty_deriv' to the derivative w.r.t. - // the result. - // Note: config_.sum_to_one_penalty is exclusive with - // config_.enforce_sum_to_one, so there is really no distinction between - // normalized and unnormalized weights here (since normalization would be a - // no-op). 
- double GetSumToOnePenalty(const VectorBase &weights, - VectorBase *weights_penalty_deriv, - bool print_weights = false) const; - - - // Computes the nnet-parameter vector from the normalized weights and - // nnet_params_, as a vector. (See the functions Vectorize() and - // UnVectorize() for how they relate to the nnet's components' parameters). - void GetNnetParameters(const Vector &normalized_weights, - VectorBase *nnet_params) const; - - // This function computes the objective function (and its derivative, if the objective - // function is finite) at the given value of nnet parameters. This involves the - // nnet computation. - double ComputeObjfAndDerivFromNnet(VectorBase &nnet_params, - VectorBase *nnet_params_deriv); - - // Given an objective-function derivative with respect to the nnet parameters, - // computes the derivative with respect to the (normalized) weights. - void GetWeightsDeriv(const VectorBase &nnet_params_deriv, - VectorBase *normalized_weights_deriv); - - - // Computes the derivative w.r.t. the unnormalized weights, by propagating - // through the normalization operation. - // If config_.enforce_sum_to_one == false, just copies norm_weights_deriv to - // unnorm_weights_deriv. - void GetUnnormalizedWeightsDeriv(const VectorBase &unnorm_weights, - const VectorBase &norm_weights_deriv, - VectorBase *unnorm_weights_deriv); - - - // Given a derivative w.r.t. the weights, outputs a derivative w.r.t. 
- // the params - void GetParamsDeriv(const VectorBase &weights, - const VectorBase &weight_deriv, - VectorBase *param_deriv); - - void ComputeUpdatableComponentDims(); - void FinishPreprocessingInput(); - -}; - - - -} // namespace nnet3 -} // namespace kaldi - -#endif diff --git a/src/nnet3/nnet-combine.cc b/src/nnet3/nnet-combine.cc deleted file mode 100644 index fa570ec96a3..00000000000 --- a/src/nnet3/nnet-combine.cc +++ /dev/null @@ -1,606 +0,0 @@ -// nnet3/nnet-combine.cc - -// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#include "nnet3/nnet-combine.h" -#include "nnet3/nnet-utils.h" - -namespace kaldi { -namespace nnet3 { - -NnetCombiner::NnetCombiner(const NnetCombineConfig &config, - int32 num_nnets, - const std::vector &egs, - const Nnet &first_nnet): - config_(config), - egs_(egs), - nnet_(first_nnet), - num_real_input_nnets_(num_nnets), - nnet_params_(std::min(num_nnets, config_.max_effective_inputs), - NumParameters(first_nnet)), - tot_input_weighting_(nnet_params_.NumRows()) { - - if (config_.sum_to_one_penalty != 0.0 && - config_.enforce_sum_to_one) { - KALDI_WARN << "--sum-to-one-penalty=" << config_.sum_to_one_penalty - << " is nonzero, so setting --enforce-sum-to-one=false."; - config_.enforce_sum_to_one = false; - } - SubVector first_params(nnet_params_, 0); - VectorizeNnet(nnet_, &first_params); - tot_input_weighting_(0) += 1.0; - num_nnets_provided_ = 1; - ComputeUpdatableComponentDims(); - NnetComputeProbOptions compute_prob_opts; - compute_prob_opts.compute_deriv = true; - prob_computer_ = new NnetComputeProb(compute_prob_opts, nnet_); -} - -void NnetCombiner::ComputeUpdatableComponentDims(){ - updatable_component_dims_.clear(); - for (int32 c = 0; c < nnet_.NumComponents(); c++) { - Component *comp = nnet_.GetComponent(c); - if (comp->Properties() & kUpdatableComponent) { - // For now all updatable components inherit from class UpdatableComponent. - // If that changes in future, we will change this code. 
- UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - "UpdatableComponent; change this code."; - updatable_component_dims_.push_back(uc->NumParameters()); - } - } -} - -void NnetCombiner::AcceptNnet(const Nnet &nnet) { - KALDI_ASSERT(num_nnets_provided_ < num_real_input_nnets_ && - "You called AcceptNnet too many times."); - int32 num_effective_nnets = nnet_params_.NumRows(); - if (num_effective_nnets == num_real_input_nnets_) { - SubVector this_params(nnet_params_, num_nnets_provided_); - VectorizeNnet(nnet, &this_params); - tot_input_weighting_(num_nnets_provided_) += 1.0; - } else { - // this_index is a kind of warped index, mapping the range - // 0 ... num_real_inputs_nnets_ - 1 onto the range - // 0 ... num_effective_nnets - 1. View the index as falling in - // between two integer indexes and determining weighting factors. - // we could view this as triangular bins. - BaseFloat this_index = num_nnets_provided_ * (num_effective_nnets - 1) - / static_cast(num_real_input_nnets_ - 1); - int32 lower_index = std::floor(this_index), - upper_index = lower_index + 1; - BaseFloat remaining_part = this_index - lower_index, - lower_weight = 1.0 - remaining_part, - upper_weight = remaining_part; - KALDI_ASSERT(lower_index >= 0 && upper_index <= num_effective_nnets && - lower_weight >= 0.0 && upper_weight >= 0.0 && - lower_weight <= 1.0 && upper_weight <= 1.0); - Vector vec(nnet_params_.NumCols(), kUndefined); - VectorizeNnet(nnet, &vec); - nnet_params_.Row(lower_index).AddVec(lower_weight, vec); - tot_input_weighting_(lower_index) += lower_weight; - if (upper_index == num_effective_nnets) { - KALDI_ASSERT(upper_weight < 0.1); - } else { - nnet_params_.Row(upper_index).AddVec(upper_weight, vec); - tot_input_weighting_(upper_index) += upper_weight; - } - } - num_nnets_provided_++; -} - -void NnetCombiner::FinishPreprocessingInput() { - KALDI_ASSERT(num_nnets_provided_ == num_real_input_nnets_ && - 
"You did not call AcceptInput() enough times."); - int32 num_effective_nnets = nnet_params_.NumRows(); - for (int32 i = 0; i < num_effective_nnets; i++) { - BaseFloat tot_weight = tot_input_weighting_(i); - KALDI_ASSERT(tot_weight > 0.0); // Or would be a coding error. - // Rescale so this row is like a weighted average instead of - // a weighted sum. - if (tot_weight != 1.0) - nnet_params_.Row(i).Scale(1.0 / tot_weight); - } -} - -void NnetCombiner::Combine() { - FinishPreprocessingInput(); - - if (!SelfTestDerivatives()) { - KALDI_LOG << "Self-testing model derivatives since parameter-derivatives " - "self-test failed."; - SelfTestModelDerivatives(); - } - - int32 dim = ParameterDim(); - LbfgsOptions lbfgs_options; - lbfgs_options.minimize = false; // We're maximizing. - lbfgs_options.m = dim; // Store the same number of vectors as the dimension - // itself, so this is BFGS. - lbfgs_options.first_step_impr = config_.initial_impr; - - Vector params(dim), deriv(dim); - double objf, initial_objf; - GetInitialParameters(¶ms); - - - OptimizeLbfgs lbfgs(params, lbfgs_options); - - for (int32 i = 0; i < config_.num_iters; i++) { - params.CopyFromVec(lbfgs.GetProposedValue()); - objf = ComputeObjfAndDerivFromParameters(params, &deriv); - KALDI_VLOG(2) << "Iteration " << i << " params = " << params - << ", objf = " << objf << ", deriv = " << deriv; - if (i == 0) initial_objf = objf; - lbfgs.DoStep(objf, deriv); - } - - if (!config_.sum_to_one_penalty) { - KALDI_LOG << "Combining nnets, objective function changed from " - << initial_objf << " to " << objf; - } else { - Vector weights(WeightDim()); - GetWeights(params, &weights); - bool print_weights = true; - double penalty = GetSumToOnePenalty(weights, NULL, print_weights); - // note: initial_objf has no penalty term because it summed exactly - // to one. 
- KALDI_LOG << "Combining nnets, objective function changed from " - << initial_objf << " to " << objf << " = " - << (objf - penalty) << " + " << penalty; - } - - - // must recompute nnet_ if "params" is not exactly equal to the - // final params that LB - Vector final_params(dim); - final_params.CopyFromVec(lbfgs.GetValue(&objf)); - if (!params.ApproxEqual(final_params, 0.0)) { - // the following call makes sure that nnet_ corresponds to the parameters - // in "params". - ComputeObjfAndDerivFromParameters(final_params, &deriv); - } - PrintParams(final_params); - -} - -void NnetCombiner::PrintParams(const VectorBase ¶ms) const { - Vector weights(WeightDim()), normalized_weights(WeightDim()); - GetWeights(params, &weights); - GetNormalizedWeights(weights, &normalized_weights); - int32 num_models = nnet_params_.NumRows(), - num_uc = NumUpdatableComponents(); - - if (config_.separate_weights_per_component) { - std::vector updatable_component_names; - for (int32 c = 0; c < nnet_.NumComponents(); c++) { - const Component *comp = nnet_.GetComponent(c); - if (comp->Properties() & kUpdatableComponent) - updatable_component_names.push_back(nnet_.GetComponentName(c)); - } - KALDI_ASSERT(static_cast(updatable_component_names.size()) == - NumUpdatableComponents()); - for (int32 uc = 0; uc < num_uc; uc++) { - std::ostringstream os; - os.width(20); - os << std::left << updatable_component_names[uc] << ": "; - os.width(9); - os.precision(4); - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + uc; - os << " " << std::left << normalized_weights(index); - } - KALDI_LOG << "Weights for " << os.str(); - } - } else { - int32 c = 0; // arbitrarily chosen; they'll all be the same. 
- std::ostringstream os; - os.width(9); - os.precision(4); - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - os << " " << std::left << normalized_weights(index); - } - KALDI_LOG << "Model weights are: " << os.str(); - } - int32 num_effective_nnets = nnet_params_.NumRows(); - if (num_effective_nnets != num_real_input_nnets_) - KALDI_LOG << "Above, only " << num_effective_nnets << " weights were " - "printed due to the the --max-effective-inputs option; " - "there were " << num_real_input_nnets_ << " actual input nnets. " - "Each weight corresponds to a weighted average over a range of " - "nnets in the sequence (with triangular bins)"; -} - -bool NnetCombiner::SelfTestDerivatives() { - int32 num_tests = 2; // more properly, this is the number of dimensions in a - // single test. - double delta = 0.001; - int32 dim = ParameterDim(); - - Vector params(dim), deriv(dim); - Vector predicted_changes(num_tests), - observed_changes(num_tests); - - GetInitialParameters(¶ms); - double initial_objf = ComputeObjfAndDerivFromParameters(params, - &deriv); - for (int32 i = 0; i < num_tests; i++) { - Vector new_deriv(dim), offset(dim), new_params(params); - offset.SetRandn(); - new_params.AddVec(delta, offset); - double new_objf = ComputeObjfAndDerivFromParameters(new_params, - &new_deriv); - // for predicted changes, interpolate old and new derivs. 
- predicted_changes(i) = - 0.5 * VecVec(new_params, deriv) - 0.5 * VecVec(params, deriv) + - 0.5 * VecVec(new_params, new_deriv) - 0.5 * VecVec(params, new_deriv); - observed_changes(i) = new_objf - initial_objf; - } - double threshold = 0.1; - KALDI_LOG << "predicted_changes = " << predicted_changes; - KALDI_LOG << "observed_changes = " << observed_changes; - if (!ApproxEqual(predicted_changes, observed_changes, threshold)) { - KALDI_WARN << "Derivatives self-test failed."; - return false; - } else { - return true; - } -} - - -void NnetCombiner::SelfTestModelDerivatives() { - int32 num_tests = 3; // more properly, this is the number of dimensions in a - // single test. - int32 dim = ParameterDim(); - - Vector params(dim), deriv(dim); - Vector predicted_changes(num_tests), - observed_changes(num_tests); - - GetInitialParameters(¶ms); - Vector weights(WeightDim()), normalized_weights(WeightDim()); - Vector nnet_params(NnetParameterDim(), kUndefined), - nnet_deriv(NnetParameterDim(), kUndefined); - GetWeights(params, &weights); - GetNormalizedWeights(weights, &normalized_weights); - GetNnetParameters(normalized_weights, &nnet_params); - - double initial_objf = ComputeObjfAndDerivFromNnet(nnet_params, - &nnet_deriv); - - double delta = 0.002 * std::sqrt(VecVec(nnet_params, nnet_params) / - NnetParameterDim()); - - - for (int32 i = 0; i < num_tests; i++) { - Vector new_nnet_deriv(NnetParameterDim()), - offset(NnetParameterDim()), new_nnet_params(nnet_params); - offset.SetRandn(); - new_nnet_params.AddVec(delta, offset); - double new_objf = ComputeObjfAndDerivFromNnet(new_nnet_params, - &new_nnet_deriv); - // for predicted changes, interpolate old and new derivs. 
- predicted_changes(i) = - 0.5 * VecVec(new_nnet_params, nnet_deriv) - - 0.5 * VecVec(nnet_params, nnet_deriv) + - 0.5 * VecVec(new_nnet_params, new_nnet_deriv) - - 0.5 * VecVec(nnet_params, new_nnet_deriv); - observed_changes(i) = new_objf - initial_objf; - } - double threshold = 0.1; - KALDI_LOG << "model-derivatives: predicted_changes = " << predicted_changes; - KALDI_LOG << "model-derivatives: observed_changes = " << observed_changes; - if (!ApproxEqual(predicted_changes, observed_changes, threshold)) - KALDI_WARN << "Model derivatives self-test failed."; -} - - - - -int32 NnetCombiner::ParameterDim() const { - if (config_.separate_weights_per_component) - return NumUpdatableComponents() * nnet_params_.NumRows(); - else - return nnet_params_.NumRows(); -} - - -void NnetCombiner::GetInitialParameters(VectorBase *params) const { - KALDI_ASSERT(params->Dim() == ParameterDim()); - params->Set(1.0 / nnet_params_.NumRows()); - if (config_.enforce_positive_weights) { - // we enforce positive weights by treating the params as the log of the - // actual weight. - params->ApplyLog(); - } -} - -void NnetCombiner::GetWeights(const VectorBase ¶ms, - VectorBase *weights) const { - KALDI_ASSERT(weights->Dim() == WeightDim()); - if (config_.separate_weights_per_component) { - weights->CopyFromVec(params); - } else { - int32 nc = NumUpdatableComponents(); - // have one parameter per row of nnet_params_, and need to repeat - // the weight for the different components. - for (int32 n = 0; n < nnet_params_.NumRows(); n++) { - for (int32 c = 0; c < nc; c++) - (*weights)(n * nc + c) = params(n); - } - } - // we enforce positive weights by having the weights be the exponential of the - // corresponding parameters. 
- if (config_.enforce_positive_weights) - weights->ApplyExp(); -} - - -void NnetCombiner::GetParamsDeriv(const VectorBase &weights, - const VectorBase &weights_deriv, - VectorBase *param_deriv) { - KALDI_ASSERT(weights.Dim() == WeightDim() && - param_deriv->Dim() == ParameterDim()); - Vector preexp_weights_deriv(weights_deriv); - if (config_.enforce_positive_weights) { - // to enforce positive weights we first compute weights (call these - // preexp_weights) and then take exponential. Note, d/dx exp(x) = exp(x). - // So the derivative w.r.t. the preexp_weights equals the derivative - // w.r.t. the weights, times the weights. - preexp_weights_deriv.MulElements(weights); - } - if (config_.separate_weights_per_component) { - param_deriv->CopyFromVec(preexp_weights_deriv); - } else { - int32 nc = NumUpdatableComponents(); - param_deriv->SetZero(); - for (int32 n = 0; n < nnet_params_.NumRows(); n++) - for (int32 c = 0; c < nc; c++) - (*param_deriv)(n) += preexp_weights_deriv(n * nc + c); - } -} - - -double NnetCombiner::GetSumToOnePenalty( - const VectorBase &weights, - VectorBase *weights_penalty_deriv, - bool print_weights) const { - - KALDI_ASSERT(config_.sum_to_one_penalty >= 0.0); - double penalty = config_.sum_to_one_penalty; - if (penalty == 0.0) { - weights_penalty_deriv->SetZero(); - return 0.0; - } - double ans = 0.0; - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - Vector tot_weights(num_uc); - std::ostringstream tot_weight_info; - for (int32 c = 0; c < num_uc; c++) { - double this_total_weight = 0.0; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - double this_weight = weights(index); - this_total_weight += this_weight; - } - tot_weights(c) = this_total_weight; - // this_total_weight_deriv is the derivative of the penalty - // term w.r.t. this component's total weight. 
- double this_total_weight_deriv; - if (config_.enforce_positive_weights) { - // if config_.enforce_positive_weights is true, then we choose to - // formulate the penalty in a slightly different way.. this solves the - // problem that with the formulation in the 'else' below, if for some - // reason the total weight is << 1.0, the deriv w.r.t. the actual - // parameters gets tiny [because weight = exp(params)]. - double log_total = log(this_total_weight); - ans += -0.5 * penalty * log_total * log_total; - double log_total_deriv = -1.0 * penalty * log_total; - this_total_weight_deriv = log_total_deriv / this_total_weight; - } else { - ans += -0.5 * penalty * - (this_total_weight - 1.0) * (this_total_weight - 1.0); - this_total_weight_deriv = penalty * (1.0 - this_total_weight); - - } - if (weights_penalty_deriv != NULL) { - KALDI_ASSERT(weights.Dim() == weights_penalty_deriv->Dim()); - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - (*weights_penalty_deriv)(index) = this_total_weight_deriv; - } - } - } - if (print_weights) { - Vector tot_weights_float(tot_weights); - KALDI_LOG << "Total weights per component: " - << PrintVectorPerUpdatableComponent(nnet_, - tot_weights_float); - } - return ans; -} - - -void NnetCombiner::GetNnetParameters(const Vector &weights, - VectorBase *nnet_params) const { - KALDI_ASSERT(nnet_params->Dim() == nnet_params_.NumCols()); - nnet_params->SetZero(); - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - for (int32 m = 0; m < num_models; m++) { - const SubVector src_params(nnet_params_, m); - int32 dim_offset = 0; - for (int32 c = 0; c < num_uc; c++) { - int32 index = m * num_uc + c; - BaseFloat weight = weights(index); - int32 dim = updatable_component_dims_[c]; - const SubVector src_component_params(src_params, dim_offset, - dim); - SubVector dest_component_params(*nnet_params, dim_offset, dim); - dest_component_params.AddVec(weight, src_component_params); - dim_offset += dim; 
- } - KALDI_ASSERT(dim_offset == nnet_params_.NumCols()); - } -} - -// compare GetNnetParameters. -void NnetCombiner::GetWeightsDeriv( - const VectorBase &nnet_params_deriv, - VectorBase *weights_deriv) { - KALDI_ASSERT(nnet_params_deriv.Dim() == nnet_params_.NumCols() && - weights_deriv->Dim() == WeightDim()); - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - for (int32 m = 0; m < num_models; m++) { - const SubVector src_params(nnet_params_, m); - int32 dim_offset = 0; - for (int32 c = 0; c < num_uc; c++) { - int32 index = m * num_uc + c; - int32 dim = updatable_component_dims_[c]; - const SubVector src_component_params(src_params, dim_offset, - dim); - const SubVector component_params_deriv(nnet_params_deriv, - dim_offset, dim); - (*weights_deriv)(index) = VecVec(src_component_params, - component_params_deriv); - dim_offset += dim; - } - KALDI_ASSERT(dim_offset == nnet_params_.NumCols()); - } -} - -double NnetCombiner::ComputeObjfAndDerivFromNnet( - VectorBase &nnet_params, - VectorBase *nnet_params_deriv) { - BaseFloat sum = nnet_params.Sum(); - // inf/nan parameters->return -inf objective. - if (!(sum == sum && sum - sum == 0)) - return -std::numeric_limits::infinity(); - // Set nnet to have these params. - UnVectorizeNnet(nnet_params, &nnet_); - - prob_computer_->Reset(); - std::vector::const_iterator iter = egs_.begin(), - end = egs_.end(); - for (; iter != end; ++iter) - prob_computer_->Compute(*iter); - double tot_weights, - tot_objf = prob_computer_->GetTotalObjective(&tot_weights); - KALDI_ASSERT(tot_weights > 0.0); - const Nnet &deriv = prob_computer_->GetDeriv(); - VectorizeNnet(deriv, nnet_params_deriv); - // we prefer to deal with normalized objective functions. 
- nnet_params_deriv->Scale(1.0 / tot_weights); - return tot_objf / tot_weights; -} - - -double NnetCombiner::ComputeObjfAndDerivFromParameters( - VectorBase ¶ms, - VectorBase *params_deriv) { - Vector weights(WeightDim()), normalized_weights(WeightDim()), - weights_sum_to_one_penalty_deriv(WeightDim()), - normalized_weights_deriv(WeightDim()), weights_deriv(WeightDim()); - Vector - nnet_params(NnetParameterDim(), kUndefined), - nnet_params_deriv(NnetParameterDim(), kUndefined); - GetWeights(params, &weights); - double ans = GetSumToOnePenalty(weights, &weights_sum_to_one_penalty_deriv); - GetNormalizedWeights(weights, &normalized_weights); - GetNnetParameters(normalized_weights, &nnet_params); - ans += ComputeObjfAndDerivFromNnet(nnet_params, &nnet_params_deriv); - if (ans != ans || ans - ans != 0) // NaN or inf - return ans; // No point computing derivative - GetWeightsDeriv(nnet_params_deriv, &normalized_weights_deriv); - GetUnnormalizedWeightsDeriv(weights, normalized_weights_deriv, - &weights_deriv); - weights_deriv.AddVec(1.0, weights_sum_to_one_penalty_deriv); - GetParamsDeriv(weights, weights_deriv, params_deriv); - return ans; -} - - -// enforces the constraint that the weights for each component must sum to one, -// if necessary. -void NnetCombiner::GetNormalizedWeights( - const VectorBase &unnorm_weights, - VectorBase *norm_weights) const { - if (!config_.enforce_sum_to_one) { - norm_weights->CopyFromVec(unnorm_weights); - return; - } - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - for (int32 c = 0; c < num_uc; c++) { - double sum = 0.0; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - sum += unnorm_weights(index); - } - double inv_sum = 1.0 / sum; // if it's NaN then it's OK, we'll get NaN - // weights and eventually -inf objective. 
- for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - (*norm_weights)(index) = unnorm_weights(index) * inv_sum; - } - } -} - -void NnetCombiner::GetUnnormalizedWeightsDeriv( - const VectorBase &unnorm_weights, - const VectorBase &norm_weights_deriv, - VectorBase *unnorm_weights_deriv) { - if (!config_.enforce_sum_to_one) { - unnorm_weights_deriv->CopyFromVec(norm_weights_deriv); - return; - } - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - for (int32 c = 0; c < num_uc; c++) { - double sum = 0.0; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - sum += unnorm_weights(index); - } - double inv_sum = 1.0 / sum; - double inv_sum_deriv = 0.0; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - // in the forward direction, we'd do: - // (*norm_weights)(index) = unnorm_weights(index) * inv_sum; - (*unnorm_weights_deriv)(index) = inv_sum * norm_weights_deriv(index); - inv_sum_deriv += norm_weights_deriv(index) * unnorm_weights(index); - } - // note: d/dx (1/x) = -1/x^2 - double sum_deriv = -1.0 * inv_sum_deriv * inv_sum * inv_sum; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - (*unnorm_weights_deriv)(index) += sum_deriv; - } - } -} - - - - -} // namespace nnet3 -} // namespace kaldi diff --git a/src/nnet3/nnet-combine.h b/src/nnet3/nnet-combine.h deleted file mode 100644 index 5b60d30b8ed..00000000000 --- a/src/nnet3/nnet-combine.h +++ /dev/null @@ -1,251 +0,0 @@ -// nnet3/nnet-combine.h - -// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_NNET3_NNET_COMBINE_H_ -#define KALDI_NNET3_NNET_COMBINE_H_ - -#include "nnet3/nnet-utils.h" -#include "nnet3/nnet-compute.h" -#include "util/parse-options.h" -#include "itf/options-itf.h" -#include "nnet3/nnet-diagnostics.h" - - -namespace kaldi { -namespace nnet3 { - -/** Configuration class that controls neural net combination, where we combine a - number of neural nets. -*/ -struct NnetCombineConfig { - int32 num_iters; // The dimension of the space we are optimizing in is fairly - // small (equal to the number of components times the number - // of neural nets we were given), so we optimize with BFGS - // (internally the code uses L-BFGS, but we set the the - // number of vectors to be the same as the dimension of the - // space, so it actually is regular BFGS. The num-iters - // corresponds to the number of function evaluations. 
- - - BaseFloat initial_impr; - int32 max_effective_inputs; - bool test_gradient; - bool enforce_positive_weights; - bool enforce_sum_to_one; - BaseFloat sum_to_one_penalty; - bool separate_weights_per_component; - NnetCombineConfig(): num_iters(60), - initial_impr(0.01), - max_effective_inputs(15), - test_gradient(false), - enforce_positive_weights(false), - enforce_sum_to_one(false), - sum_to_one_penalty(0.0), - separate_weights_per_component(true) { } - - void Register(OptionsItf *po) { - po->Register("num-iters", &num_iters, "Maximum number of function " - "evaluations for BFGS to use when optimizing combination weights"); - po->Register("max-effective-inputs", &max_effective_inputs, "Limits the number of " - "parameters that have to be learn to be equivalent to the number of " - "parameters we'd have to learn if the number of inputs nnets equalled " - "this number. Does this by using averages of nnets at close points " - "in the sequence of inputs, as the actual inputs to the computation."); - po->Register("initial-impr", &initial_impr, "Amount of objective-function change " - "we aim for on the first iteration (controls the initial step size)."); - po->Register("test-gradient", &test_gradient, "If true, activate code that " - "tests the gradient is accurate."); - po->Register("enforce-positive-weights", &enforce_positive_weights, - "If true, enforce that all weights are positive."); - po->Register("enforce-sum-to-one", &enforce_sum_to_one, "If true, enforce that " - "the model weights for each component should sum to one."); - po->Register("sum-to-one-penalty", &sum_to_one_penalty, "If >0, a penalty term " - "on the squared difference between sum(weights) for one component," - " and 1.0. This is like --enforce-sum-to-one, but done in a 'soft' " - "way (e.g. maybe useful with dropout). 
We suggest small values " - "like 10e-3 (for regular nnets) or 1.0e-04 (for chain models)."); - po->Register("separate-weights-per-component", &separate_weights_per_component, - "If true, have a separate weight for each updatable component in " - "the nnet."); - } -}; - - -/* - You should use this class as follows: - - Call the constructor, giving it the egs and the first nnet. - - Call AcceptNnet to provide all the other nnets. (the nnets will - be stored in a matrix in CPU memory, to avoid filing up GPU memory). - - Call Combine() - - Get the resultant nnet with GetNnet(). - */ -class NnetCombiner { - public: - /// Caution: this object retains a const reference to the "egs", so don't - /// delete them until it goes out of scope. - NnetCombiner(const NnetCombineConfig &config, - int32 num_nnets, - const std::vector &egs, - const Nnet &first_nnet); - /// You should call this function num_nnets-1 times after calling - /// the constructor, to provide the remaining nnets. - void AcceptNnet(const Nnet &nnet); - void Combine(); - const Nnet &GetNnet() const { return nnet_; } - - ~NnetCombiner() { delete prob_computer_; } - private: - NnetCombineConfig config_; - - const std::vector &egs_; - - Nnet nnet_; // The current neural network. - - NnetComputeProb *prob_computer_; - - std::vector updatable_component_dims_; // dimension of each updatable - // component. - - int32 num_real_input_nnets_; // number of actual nnet inputs. - - int32 num_nnets_provided_; // keeps track of the number of calls to AcceptNnet(). - - // nnet_params_ are the parameters of the "effective input" - // neural nets; they will often be the same as the real inputs, - // but if num_real_input_nnets_ > config_.num_effective_nnets, they - // will be weighted combinations. 
- Matrix nnet_params_; - - // This vector has the same dimension as nnet_params_.NumRows(), - // and helps us normalize so each row of nnet_params corresponds to - // a weighted average of its inputs (will be all ones if - // config_.max_effective_inputs >= the number of nnets provided). - Vector tot_input_weighting_; - - // returns the parameter dimension, i.e. the dimension of the parameters that - // we are optimizing. This depends on the config, the number of updatable - // components and nnet_params_.NumRows(); it will never exceed the number of - // updatable components times nnet_params_.NumRows(). - int32 ParameterDim() const; - - int32 NumUpdatableComponents() const { - return updatable_component_dims_.size(); - } - // returns the weight dimension. - int32 WeightDim() const { - return nnet_params_.NumRows() * NumUpdatableComponents(); - } - - int32 NnetParameterDim() const { return nnet_params_.NumCols(); } - - // Computes the initial parameters. The parameters are the underlying thing - // that we optimize; their dimension equals ParameterDim(). They are not the same - // thing as the nnet parameters. - void GetInitialParameters(VectorBase *params) const; - - // Tests that derivatives are accurate. Prints warning and returns false if not. - bool SelfTestDerivatives(); - - // Tests that model derivatives are accurate. Just prints warning if not. - void SelfTestModelDerivatives(); - - - // prints the parameters via logging statements. - void PrintParams(const VectorBase ¶ms) const; - - // This function computes the objective function (and its derivative, if the objective - // function is finite) at the given value of the parameters (the parameters we're optimizing, - // i.e. the combination weights; not the nnet parameters. This function calls most of the - // functions below. - double ComputeObjfAndDerivFromParameters( - VectorBase ¶ms, - VectorBase *params_deriv); - - - // Computes the weights from the parameters in a config-dependent way. 
The - // weight dimension is always (the number of updatable components times - // nnet_params_.NumRows()). - void GetWeights(const VectorBase ¶ms, - VectorBase *weights) const; - - // Given the raw weights: if config_.enforce_sum_to_one, then compute weights - // with sum-to-one constrint per component included; else just copy input to - // output. - void GetNormalizedWeights(const VectorBase &unnorm_weights, - VectorBase *norm_weights) const; - - // if config_.sum_to_one_penalty is 0.0, returns 0.0 and sets - // weights_penalty_deriv to 0.0; else it computes, for each - // updatable component u the total weight w_u, returns the value - // -0.5 * config_.sum_to_one_penalty * sum_u (w_u - 1.0)^2; - // and sets 'weights_penalty_deriv' to the derivative w.r.t. - // the result. - // Note: config_.sum_to_one_penalty is exclusive with - // config_.enforce_sum_to_one, so there is really no distinction between - // normalized and unnormalized weights here (since normalization would be a - // no-op). - double GetSumToOnePenalty(const VectorBase &weights, - VectorBase *weights_penalty_deriv, - bool print_weights = false) const; - - - // Computes the nnet-parameter vector from the normalized weights and - // nnet_params_, as a vector. (See the functions Vectorize() and - // UnVectorize() for how they relate to the nnet's components' parameters). - void GetNnetParameters(const Vector &normalized_weights, - VectorBase *nnet_params) const; - - // This function computes the objective function (and its derivative, if the objective - // function is finite) at the given value of nnet parameters. This involves the - // nnet computation. - double ComputeObjfAndDerivFromNnet(VectorBase &nnet_params, - VectorBase *nnet_params_deriv); - - // Given an objective-function derivative with respect to the nnet parameters, - // computes the derivative with respect to the (normalized) weights. 
- void GetWeightsDeriv(const VectorBase &nnet_params_deriv, - VectorBase *normalized_weights_deriv); - - - // Computes the derivative w.r.t. the unnormalized weights, by propagating - // through the normalization operation. - // If config_.enforce_sum_to_one == false, just copies norm_weights_deriv to - // unnorm_weights_deriv. - void GetUnnormalizedWeightsDeriv(const VectorBase &unnorm_weights, - const VectorBase &norm_weights_deriv, - VectorBase *unnorm_weights_deriv); - - - // Given a derivative w.r.t. the weights, outputs a derivative w.r.t. - // the params - void GetParamsDeriv(const VectorBase &weights, - const VectorBase &weight_deriv, - VectorBase *param_deriv); - - void ComputeUpdatableComponentDims(); - void FinishPreprocessingInput(); - -}; - - - -} // namespace nnet3 -} // namespace kaldi - -#endif diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index 5d67715a228..d2db4b4df67 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ b/src/nnet3bin/nnet3-combine.cc @@ -28,42 +28,49 @@ namespace kaldi { namespace nnet3 { -double ComputeObjf(const std::vector &egs, +// Computes the objective of the moving average of nnet on egs. If either of +// batchnorm/dropout test modes is true, we make a copy of the moving average, +// set test modes on that and evaluate its objective. Note: the object that +// prob_computer->nnet_ refers to should be moving_average_nnet. +double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, + const std::vector &egs, + const Nnet &moving_average_nnet, NnetComputeProb *prob_computer) { - prob_computer->Reset(); - std::vector::const_iterator iter = egs.begin(), - end = egs.end(); - for (; iter != end; ++iter) - prob_computer->Compute(*iter); - double tot_weights, - tot_objf = prob_computer->GetTotalObjective(&tot_weights); - KALDI_ASSERT(tot_weights > 0.0); - // we prefer to deal with normalized objective functions. 
- return tot_objf / tot_weights; + if (batchnorm_test_mode || dropout_test_mode) { + Nnet moving_average_nnet_copy(moving_average_nnet); + if (batchnorm_test_mode) + SetBatchnormTestMode(true, &moving_average_nnet_copy); + if (dropout_test_mode) + SetDropoutTestMode(true, &moving_average_nnet_copy); + NnetComputeProbOptions compute_prob_opts; + NnetComputeProb prob_computer_test(compute_prob_opts, + moving_average_nnet_copy); + return ComputeObjf(false, false, egs, moving_average_nnet_copy, + &prob_computer_test); + } else { + prob_computer->Reset(); + std::vector::const_iterator iter = egs.begin(), + end = egs.end(); + for (; iter != end; ++iter) + prob_computer->Compute(*iter); + double tot_weights, + tot_objf = prob_computer->GetTotalObjective(&tot_weights); + KALDI_ASSERT(tot_weights > 0.0); + // inf/nan tot_objf->return -inf objective. + if (!(tot_objf == tot_objf && tot_objf - tot_objf == 0)) + return -std::numeric_limits::infinity(); + // we prefer to deal with normalized objective functions. + return tot_objf / tot_weights; + } } -// Note: the object that prob_computer.nnet_ refers to should be -// *moving_average_nnet. -double UpdateNnetMovingAverageAndComputeObjf(int32 num_models, - const std::vector &egs, - const Nnet &nnet, Nnet *moving_average_nnet, - NnetComputeProb *prob_computer) { - int32 num_params = NumParameters(nnet); - KALDI_ASSERT(num_params == NumParameters(*moving_average_nnet)); - Vector nnet_params(num_params, kUndefined), - moving_average_nnet_params(num_params, kUndefined); - VectorizeNnet(nnet, &nnet_params); - VectorizeNnet(*moving_average_nnet, &moving_average_nnet_params); - moving_average_nnet_params.Scale((num_models - 1.0) / num_models); - moving_average_nnet_params.AddVec(1.0 / num_models, nnet_params); - - BaseFloat sum = moving_average_nnet_params.Sum(); - // inf/nan parameters->return -inf objective. 
- if (!(sum == sum && sum - sum == 0)) - return -std::numeric_limits::infinity(); - - UnVectorizeNnet(moving_average_nnet_params, moving_average_nnet); - return ComputeObjf(egs, prob_computer); +// Updates moving average over num_models nnets, given the average over +// previous (num_models - 1) nnets, and the new nnet. +void UpdateNnetMovingAverage(int32 num_models, + const Nnet &nnet, Nnet *moving_average_nnet) { + KALDI_ASSERT(NumParameters(nnet) == NumParameters(*moving_average_nnet)); + ScaleNnet((num_models - 1.0) / num_models, moving_average_nnet); + AddNnet(nnet, 1.0 / num_models, moving_average_nnet); } } @@ -80,7 +87,7 @@ int main(int argc, char *argv[]) { "Using a subset of training or held-out examples, compute the average\n" "over the first n nnet3 models where we maxize the objective function\n" "for n. Note that the order of models has been reversed before\n" - "feeding into this binary. So we are actually combining last n models.\n" + "being fed into this binary. So we are actually combining last n models.\n" "Inputs and outputs are 'raw' nnets.\n" "\n" "Usage: nnet3-combine [options] ... \n" @@ -89,17 +96,24 @@ int main(int argc, char *argv[]) { " nnet3-combine 1.1.raw 1.2.raw 1.3.raw ark:valid.egs 2.raw\n"; bool binary_write = true; + int32 max_objective_evaluations = 30; bool batchnorm_test_mode = false, dropout_test_mode = true; std::string use_gpu = "yes"; ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("max-objective-evaluations", &max_objective_evaluations, "Max " + "number of objective evaluations in order to figure out the " + "best number of models to combine. It helps to speedup if " + "the number of models provided to this binary is quite large " + "(e.g. 
several hundred)."); po.Register("batchnorm-test-mode", &batchnorm_test_mode, - "If true, set test-mode to true on any BatchNormComponents."); + "If true, set test-mode to true on any BatchNormComponents " + "while evaluating objectives."); po.Register("dropout-test-mode", &dropout_test_mode, "If true, set test-mode to true on any DropoutComponents and " - "DropoutMaskComponents."); + "DropoutMaskComponents while evaluating objectives."); po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); @@ -123,13 +137,7 @@ int main(int argc, char *argv[]) { ReadKaldiObject(nnet_rxfilename, &nnet); Nnet moving_average_nnet(nnet), best_nnet(nnet); NnetComputeProbOptions compute_prob_opts; - NnetComputeProb *prob_computer = new NnetComputeProb(compute_prob_opts, - moving_average_nnet); - - if (batchnorm_test_mode) - SetBatchnormTestMode(true, &nnet); - if (dropout_test_mode) - SetDropoutTestMode(true, &nnet); + NnetComputeProb prob_computer(compute_prob_opts, moving_average_nnet); std::vector egs; egs.reserve(10000); // reserve a lot of space to minimize the chance of @@ -144,25 +152,35 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(!egs.empty()); } - int32 best_n = 1; - double best_objf = ComputeObjf(egs, prob_computer); + // first evaluates the objective using the last model. + int32 best_num_to_combine = 1; + double best_objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, + egs, moving_average_nnet, &prob_computer); KALDI_LOG << "objective function using the last model is " << best_objf; - int32 num_inputs = po.NumArgs() - 2; - if (num_inputs > 1) { - for (int32 n = 1; n < num_inputs; n++) { + int32 num_nnets = po.NumArgs() - 2; + // then each time before we re-evaluate the objective function, we will add + // num_to_add models to the moving average. 
+ int32 num_to_add = (num_nnets + max_objective_evaluations - 1) / + max_objective_evaluations; + if (num_nnets > 1) { + for (int32 n = 1; n < num_nnets; n++) { ReadKaldiObject(po.GetArg(1 + n), &nnet); - double objf = UpdateNnetMovingAverageAndComputeObjf(n + 1, egs, nnet, - &moving_average_nnet, prob_computer); - KALDI_LOG << "Combining last " << n + 1 - << " models, objective function is " << objf; - if (objf > best_objf) { - best_objf = objf; - best_nnet = moving_average_nnet; - best_n = n + 1; + // updates the moving average + UpdateNnetMovingAverage(n + 1, nnet, &moving_average_nnet); + if ((n - 1) % num_to_add == num_to_add - 1 || n == num_nnets - 1) { + double objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, + egs, moving_average_nnet, &prob_computer); + KALDI_LOG << "Combining last " << n + 1 + << " models, objective function is " << objf; + if (objf > best_objf) { + best_objf = objf; + best_nnet = moving_average_nnet; + best_num_to_combine = n + 1; + } } } - KALDI_LOG << "Using the model averaged over last " << best_n + KALDI_LOG << "Using the model averaged over last " << best_num_to_combine << " models, objective function is " << best_objf; #if HAVE_CUDA==1 From 88e2914be72fb0b49ecd96ea258cfe5f77b5fdaa Mon Sep 17 00:00:00 2001 From: freewym Date: Mon, 11 Dec 2017 21:07:51 -0500 Subject: [PATCH 011/184] python-level changes, added more documentations. 
--- .../nnet3/train/chain_objf/acoustic_model.py | 17 ++++------------- egs/wsj/s5/steps/libs/nnet3/train/common.py | 12 +++++++++++- .../libs/nnet3/train/frame_level_objf/common.py | 11 ++++------- egs/wsj/s5/steps/nnet3/chain/train.py | 2 +- egs/wsj/s5/steps/nnet3/train_dnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_rnn.py | 2 +- src/chainbin/nnet3-chain-combine.cc | 12 +++++++----- src/nnet3bin/nnet3-combine.cc | 12 +++++++----- 10 files changed, 38 insertions(+), 36 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 02a3b4c75d5..5b640510ea1 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -492,7 +492,7 @@ def compute_progress(dir, iter, run_opts): def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, egs_dir, leaky_hmm_coefficient, l2_regularize, xent_regularize, run_opts, - sum_to_one_penalty=0.0): + max_objective_evaluations=30): """ Function to do model combination In the nnet3 setup, the logic @@ -505,9 +505,6 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st models_to_combine.add(num_iters) - # TODO: if it turns out the sum-to-one-penalty code is not useful, - # remove support for it. 
- for iter in sorted(models_to_combine): model_file = '{0}/{1}.mdl'.format(dir, iter) if os.path.exists(model_file): @@ -528,12 +525,9 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st common_lib.execute_command( """{command} {combine_queue_opt} {dir}/log/combine.log \ - nnet3-chain-combine --num-iters={opt_iters} \ + nnet3-chain-combine \ + --max-objective-evaluations={max_objective_evaluations} \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ - --separate-weights-per-component={separate_weights} \ - --enforce-sum-to-one={hard_enforce} \ - --sum-to-one-penalty={penalty} \ - --enforce-positive-weights=true \ --verbose=3 {dir}/den.fst {raw_models} \ "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/combine.cegs ark:- | \ nnet3-chain-merge-egs --minibatch-size={num_chunk_per_mb} \ @@ -542,12 +536,9 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st {dir}/final.mdl""".format( command=run_opts.command, combine_queue_opt=run_opts.combine_queue_opt, - opt_iters=(20 if sum_to_one_penalty <= 0 else 80), - separate_weights=(sum_to_one_penalty > 0), + max_objective_evaluations=max_objective_evaluations, l2=l2_regularize, leaky=leaky_hmm_coefficient, dir=dir, raw_models=" ".join(raw_model_strings), - hard_enforce=(sum_to_one_penalty <= 0), - penalty=sum_to_one_penalty, num_chunk_per_mb=num_chunk_per_minibatch_str, num_iters=num_iters, egs_dir=egs_dir)) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index b3b443ceb4c..8168d2f94a6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -852,6 +852,16 @@ def __init__(self, the final model combination stage. 
These models will themselves be averages of iteration-number ranges""") + self.parser.add_argument("--trainer.optimization.max-objective-evaluations", + "--trainer.max-objective-evaluations", + type=int, dest='max_objective_evaluations', + default=30, + help="""The maximum number of objective + evaluations in order to figure out the + best number of models to combine. It helps to + speedup if the number of models provided to the + model combination binary is quite large (e.g. + several hundred).""") self.parser.add_argument("--trainer.optimization.do-final-combination", dest='do_final_combination', type=str, action=common_lib.StrToBoolAction, @@ -863,7 +873,7 @@ def __init__(self, type=float, dest='combine_sum_to_one_penalty', default=0.0, help="""If > 0, activates 'soft' enforcement of the sum-to-one penalty in combination (may be helpful - if using dropout). E.g. 1.0e-03.""") + if using dropout). E.g. 1.0e-03. It is deprecated.""") self.parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', default=0.0, help="""Momentum used in update computation. 
diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index f8a69c5ad84..46eec2e3b87 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -452,7 +452,7 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, minibatch_size_str, run_opts, chunk_width=None, get_raw_nnet_from_am=True, - sum_to_one_penalty=0.0, + max_objective_evaluations=30, use_multitask_egs=False, compute_per_dim_accuracy=False): """ Function to do model combination @@ -501,10 +501,8 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, use_multitask_egs=use_multitask_egs) common_lib.execute_command( """{command} {combine_queue_opt} {dir}/log/combine.log \ - nnet3-combine --num-iters=80 \ - --enforce-sum-to-one={hard_enforce} \ - --sum-to-one-penalty={penalty} \ - --enforce-positive-weights=true \ + nnet3-combine \ + --max-objective-evaluations={max_objective_evaluations} \ --verbose=3 {raw_models} \ "ark,bg:nnet3-copy-egs {multitask_egs_opts} \ {egs_rspecifier} ark:- | \ @@ -513,9 +511,8 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, """.format(command=run_opts.command, combine_queue_opt=run_opts.combine_queue_opt, dir=dir, raw_models=" ".join(raw_model_strings), + max_objective_evaluations=max_objective_evaluations, egs_rspecifier=egs_rspecifier, - hard_enforce=(sum_to_one_penalty <= 0), - penalty=sum_to_one_penalty, mbsize=minibatch_size_str, out_model=out_model, multitask_egs_opts=multitask_egs_opts)) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index d23c379e104..b62f5510e3c 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -554,7 +554,7 @@ def train(args, run_opts): l2_regularize=args.l2_regularize, xent_regularize=args.xent_regularize, run_opts=run_opts, - 
sum_to_one_penalty=args.combine_sum_to_one_penalty) + max_objective_evaluations=args.max_objective_evaluations) else: logger.info("Copying the last-numbered model to final.mdl") common_lib.force_symlink("{0}.mdl".format(num_iters), diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 87a1fd5afed..073ad3e7d7a 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -364,7 +364,7 @@ def train(args, run_opts): models_to_combine=models_to_combine, egs_dir=egs_dir, minibatch_size_str=args.minibatch_size, run_opts=run_opts, - sum_to_one_penalty=args.combine_sum_to_one_penalty) + max_objective_evaluations=args.max_objective_evaluations) if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of " diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 38396f0b4e7..2d092ceebc7 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -398,7 +398,7 @@ def train(args, run_opts): models_to_combine=models_to_combine, egs_dir=egs_dir, minibatch_size_str=args.minibatch_size, run_opts=run_opts, get_raw_nnet_from_am=False, - sum_to_one_penalty=args.combine_sum_to_one_penalty, + max_objective_evaluations=args.max_objective_evaluations, use_multitask_egs=use_multitask_egs) else: common_lib.force_symlink("{0}.raw".format(num_iters), diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index c9ffcf7ff2c..b51632e7d2c 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -475,7 +475,7 @@ def train(args, run_opts): run_opts=run_opts, chunk_width=args.chunk_width, get_raw_nnet_from_am=False, compute_per_dim_accuracy=args.compute_per_dim_accuracy, - sum_to_one_penalty=args.combine_sum_to_one_penalty) + max_objective_evaluations=args.max_objective_evaluations) else: 
common_lib.force_symlink("{0}.raw".format(num_iters), "{0}/final.raw".format(args.dir)) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index e6f81b03c3b..005e751cae0 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -451,7 +451,7 @@ def train(args, run_opts): run_opts=run_opts, minibatch_size_str=args.num_chunk_per_minibatch, chunk_width=args.chunk_width, - sum_to_one_penalty=args.combine_sum_to_one_penalty, + max_objective_evaluations=args.max_objective_evaluations, compute_per_dim_accuracy=args.compute_per_dim_accuracy) if args.stage <= num_iters + 1: diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index 520575e1d88..80cf72e2da3 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -111,11 +111,11 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); - po.Register("max-objective-evaluations", &max_objective_evaluations, "Max " - "number of objective evaluations in order to figure out the " - "best number of models to combine. It helps to speedup if " - "the number of models provided to this binary is quite large " - "(e.g. several hundred)."); + po.Register("max-objective-evaluations", &max_objective_evaluations, "The " + "maximum number of objective evaluations in order to figure " + "out the best number of models to combine. It helps to speedup " + "if the number of models provided to this binary is quite " + "large (e.g. 
several hundred)."); po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); po.Register("batchnorm-test-mode", &batchnorm_test_mode, @@ -184,6 +184,8 @@ int main(int argc, char *argv[]) { ReadKaldiObject(this_nnet_rxfilename, &nnet); // updates the moving average UpdateNnetMovingAverage(n + 1, nnet, &moving_average_nnet); + // evaluates the objective everytime after adding num_to_add model or + // all the models to the moving average. if ((n - 1) % num_to_add == num_to_add - 1 || n == num_nnets - 1) { double objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, egs, moving_average_nnet, chain_config, den_fst, &prob_computer); diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index d2db4b4df67..a38eb3eeddd 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ b/src/nnet3bin/nnet3-combine.cc @@ -103,11 +103,11 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); - po.Register("max-objective-evaluations", &max_objective_evaluations, "Max " - "number of objective evaluations in order to figure out the " - "best number of models to combine. It helps to speedup if " - "the number of models provided to this binary is quite large " - "(e.g. several hundred)."); + po.Register("max-objective-evaluations", &max_objective_evaluations, "The " + "maximum number of objective evaluations in order to figure " + "out the best number of models to combine. It helps to speedup " + "if the number of models provided to this binary is quite " + "large (e.g. 
several hundred)."); po.Register("batchnorm-test-mode", &batchnorm_test_mode, "If true, set test-mode to true on any BatchNormComponents " "while evaluating objectives."); @@ -168,6 +168,8 @@ int main(int argc, char *argv[]) { ReadKaldiObject(po.GetArg(1 + n), &nnet); // updates the moving average UpdateNnetMovingAverage(n + 1, nnet, &moving_average_nnet); + // evaluates the objective everytime after adding num_to_add model or + // all the models to the moving average. if ((n - 1) % num_to_add == num_to_add - 1 || n == num_nnets - 1) { double objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, egs, moving_average_nnet, &prob_computer); From 49827d473ba8e84f2ae9e34c29b92de80ab18bbc Mon Sep 17 00:00:00 2001 From: freewym Date: Tue, 12 Dec 2017 00:25:03 -0500 Subject: [PATCH 012/184] fix Makefile --- src/nnet3/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index 3236c52d60f..8ddba56b0e0 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -22,9 +22,9 @@ OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \ nnet-example.o nnet-nnet.o nnet-compile-utils.o \ nnet-utils.o nnet-compute.o nnet-test-utils.o nnet-analyze.o \ nnet-example-utils.o nnet-training.o \ - nnet-diagnostics.o nnet-combine.o nnet-am-decodable-simple.o \ + nnet-diagnostics.o nnet-am-decodable-simple.o \ nnet-optimize-utils.o nnet-chain-example.o \ - nnet-chain-training.o nnet-chain-diagnostics.o nnet-chain-combine.o \ + nnet-chain-training.o nnet-chain-diagnostics.o \ discriminative-supervision.o nnet-discriminative-example.o \ nnet-discriminative-diagnostics.o \ discriminative-training.o nnet-discriminative-training.o \ From b2967572cc59089e15f26021f308feca991c7a3c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 12 Dec 2017 17:46:19 -0500 Subject: [PATCH 013/184] [src,scripts] Changing which model stats are accumulated on (will affect memory-norm). 
Other small changes to MemoryNormComponent, will rework most of this. Adding script changes for memory-norm. --- .../steps/libs/nnet3/xconfig/basic_layers.py | 7 ++++ egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 1 + src/nnet3/nnet-chain-training.cc | 10 ++++- src/nnet3/nnet-compute.cc | 38 +++++++++++++------ src/nnet3/nnet-compute.h | 32 ++++++++++++++-- src/nnet3/nnet-normalize-component.cc | 31 +++++++++++---- src/nnet3/nnet-normalize-component.h | 3 +- src/nnet3/nnet-simple-component.cc | 6 +++ src/nnet3/nnet-training.cc | 10 ++++- 9 files changed, 110 insertions(+), 28 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 05ae5bcdc18..483883fdee4 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -802,6 +802,13 @@ def _add_components(self, input_desc, input_dim, nonlinearities): ''.format(self.name, nonlinearity, output_dim, target_rms)) + elif nonlinearity == 'memnorm': + line = ('component name={0}.{1}' + ' type=MemoryNormComponent dim={2}' + ' target-rms={3} include-indirect-derivative=false ' + ''.format(self.name, nonlinearity, output_dim, + target_rms)) + elif nonlinearity == 'so': line = ('component name={0}.{1}' ' type=ScaleAndOffsetComponent dim={2} max-change=0.5 ' diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index c1cad89824c..127ea816a39 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -25,6 +25,7 @@ 'relu-batchnorm-dropout-layer' : xlayers.XconfigBasicLayer, 'relu-dropout-layer': xlayers.XconfigBasicLayer, 'relu-batchnorm-layer' : xlayers.XconfigBasicLayer, + 'relu-memnorm-layer' : xlayers.XconfigBasicLayer, 'relu-batchnorm-so-layer' : xlayers.XconfigBasicLayer, 'batchnorm-so-relu-layer' : xlayers.XconfigBasicLayer, 'sigmoid-layer' : xlayers.XconfigBasicLayer, 
diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 780a7115a8a..de351e6c543 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -94,8 +94,11 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, const NnetComputation &computation) { const NnetTrainerOptions &nnet_config = opts_.nnet_config; + // note: because we give the 1st arg (nnet_) as a pointer to the + // constructor of 'computer', it will use that copy of the nnet to + // store stats. This is mainly important for memory-norm. NnetComputer computer(nnet_config.compute_config, computation, - *nnet_, delta_nnet_); + nnet_, delta_nnet_); // give the inputs to the computer object. computer.AcceptInputs(*nnet_, eg.inputs); computer.Run(); @@ -130,8 +133,11 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, const NnetComputation &computation, bool is_backstitch_step1) { const NnetTrainerOptions &nnet_config = opts_.nnet_config; + // note: because we give the 1st arg (nnet_) as a pointer to the + // constructor of 'computer', it will use that copy of the nnet to + // store stats. This is mainly important for memory-norm. NnetComputer computer(nnet_config.compute_config, computation, - *nnet_, delta_nnet_); + nnet_, delta_nnet_); // give the inputs to the computer object. 
computer.AcceptInputs(*nnet_, eg.inputs); computer.Run(); diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 87fa62c6112..31d60e1ce4a 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -30,22 +30,37 @@ NnetComputer::NnetComputer(const NnetComputeOptions &options, const Nnet &nnet, Nnet *nnet_to_update): options_(options), computation_(computation), nnet_(nnet), - program_counter_(0), nnet_to_update_(nnet_to_update) { - KALDI_ASSERT(computation.indexes_cuda.size() == computation.indexes.size() && - computation.indexes_ranges_cuda.size() == computation.indexes_ranges.size() && + program_counter_(0), nnet_to_store_stats_(nnet_to_update), + nnet_to_update_(nnet_to_update) { + Init(); +} + +NnetComputer::NnetComputer(const NnetComputeOptions &options, + const NnetComputation &computation, + Nnet *nnet, + Nnet *nnet_to_update): + options_(options), computation_(computation), nnet_(*nnet), + program_counter_(0), nnet_to_store_stats_(nnet), + nnet_to_update_(nnet_to_update) { + Init(); +} + +void NnetComputer::Init() { + KALDI_ASSERT(computation_.indexes_cuda.size() == computation_.indexes.size() && + computation_.indexes_ranges_cuda.size() == computation_.indexes_ranges.size() && "You must call NnetComputation::ComputeCudaIndexes() before " "executing the computation."); - matrices_.resize(computation.matrices.size()); + matrices_.resize(computation_.matrices.size()); debug_ = (options_.debug || GetVerboseLevel() >= 5); if (debug_) { ComputationVariables variables; - variables.Init(computation); - ComputeCommandAttributes(nnet, computation, variables, + variables.Init(computation_); + ComputeCommandAttributes(nnet_, computation_, variables, &command_attributes_); std::string preamble; - computation.GetCommandStrings(nnet, &preamble, &command_strings_); + computation_.GetCommandStrings(nnet_, &preamble, &command_strings_); KALDI_LOG << preamble; - computation.GetSubmatrixStrings(nnet, &submatrix_strings_); + 
computation_.GetSubmatrixStrings(nnet_, &submatrix_strings_); } } @@ -177,6 +192,7 @@ NnetComputer::NnetComputer(const NnetComputer &other): nnet_(other.nnet_), program_counter_(other.program_counter_), pending_commands_(other.pending_commands_), + nnet_to_store_stats_(other.nnet_to_store_stats_), nnet_to_update_(other.nnet_to_update_), debug_(other.debug_), command_attributes_(other.command_attributes_), @@ -226,14 +242,14 @@ void NnetComputer::ExecuteCommand() { CuSubMatrix output(GetSubMatrix(c.arg4)); void *memo = component->Propagate(indexes, input, &output); if (c.arg6) { // need to store stats. - KALDI_ASSERT(nnet_to_update_ != NULL); - Component *upd_component = nnet_to_update_->GetComponent(c.arg1); + KALDI_ASSERT(nnet_to_store_stats_ != NULL); + Component *stats_component = nnet_to_store_stats_->GetComponent(c.arg1); bool was_in_place = (c.arg3 == c.arg4); // if propagate was in-place, provide empty matrix and not 'input', as // input is no longer valid. const CuSubMatrix maybe_input( GetSubMatrix(was_in_place ? 0 : c.arg3)); - upd_component->StoreStats(maybe_input, output, memo); + stats_component->StoreStats(maybe_input, output, memo); } SaveMemo(c.arg5, *component, memo); break; diff --git a/src/nnet3/nnet-compute.h b/src/nnet3/nnet-compute.h index e16cbfbb393..869dd107bf6 100644 --- a/src/nnet3/nnet-compute.h +++ b/src/nnet3/nnet-compute.h @@ -62,15 +62,29 @@ class NnetComputer { /// model update or model-derivative computation. /// You must call computation.ComputeCudaIndexes() before calling /// this function. + /// + /// Caution: there is another constructor that takes a pointer for + /// 'nnet', be careful not to mix these up. NnetComputer(const NnetComputeOptions &options, const NnetComputation &computation, const Nnet &nnet, Nnet *nnet_to_update); - /// Copy constructor. 
May not be used if memos are involved (memos are only - /// possible if backprop will take place, and in these situations you won't - /// normally be wanting to use the copy constructor anyway; the copy - /// constructor is more useful for things like RNNLM lattice rescoring). + /// This version of the constructor accepts a pointer to 'nnet' instead + /// of a const reference. The difference is that this version will, + /// for storing statistics (the StoreStats() function of class Component), + /// use 'nnet' instead of 'nnet_to_update' (if specified). + NnetComputer(const NnetComputeOptions &options, + const NnetComputation &computation, + Nnet *nnet, + Nnet *nnet_to_update); + + + /// Copy constructor. May not be used if memos are stored with this object + /// (which is only a possibility if backprop will take place, and in these + /// situations you won't normally be wanting to use the copy constructor + /// anyway; the copy constructor is more useful for things like RNNLM lattice + /// rescoring). NnetComputer(const NnetComputer &other); /// e.g. AcceptInput ("input", &input_mat), or for derivatives w.r.t. the @@ -112,9 +126,12 @@ class NnetComputer { private: + void Init(); // called from constructors. + const NnetComputeOptions &options_; const NnetComputation &computation_; const Nnet &nnet_; + int32 program_counter_; // command index to execute next. // To deal with inputs and outputs that are not provided/taken by the user in // the same order as listed in the computation, pending_commands_ contains a @@ -122,6 +139,13 @@ class NnetComputer { // executed. std::vector pending_commands_; + // A pointer to the copy of the nnet which we'll be using for stats + // accumulation (the StoreStats() function). May be NULL or the same + // as nnet_ or nnet_to_update_. + Nnet *nnet_to_store_stats_; + // A pointer to the copy of the nnet which we'll be updating the parameters + // of (nnet_to_update in the backprop function). 
May be NULL and usually + // will not be the same as nnet_. Nnet *nnet_to_update_; bool debug_; // command_attributes_ is only used if debug_=true. diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index 7eca8594748..596ab48b9e5 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -545,7 +545,6 @@ void BatchNormComponent::Backprop( // this becomes "we know that \sum_i y(i) y(i) = n * target-rms^2". in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans, temp, 1.0); // At this point, in_deriv contains x'(i) = x_deriv_base(i) + alpha y(i). - } else { KALDI_ASSERT(offset_.Dim() == block_dim_); // the next call does no work if they point to the same memory. @@ -867,18 +866,27 @@ void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, // From this point, we can assume that the num-cols of 'in' and 'out' // equals block_dim_. - Memo *ans = NULL; - if (!test_mode_) - ans = GetMemo(in); + Memo *memo = NULL; + if (!test_mode_) { + memo = GetMemo(in); + if (false) { // temporary. 
+ MemoryNormComponent *temp = new MemoryNormComponent(*this); + temp->StoreStats(in, *out, memo); + Memo *new_memo = temp->GetMemo(in); + delete memo; + memo = new_memo; + delete temp; + } + } - if (test_mode_ || stats_count_ > 0.0) { + if (test_mode_) { CuSubVector x_mean(data_, 0), scale(data_, 4); out->AddVecToRows(-1.0, x_mean); out->MulColsVec(scale); } else { - CuSubVector x_sum(memo->data, 0), + CuSubVector x_mean(memo->data, 5), scale(memo->data, 2); - out->AddVecToRows(-1.0 / memo->num_frames, x_sum); + out->AddVecToRows(-1.0, x_mean); out->MulColsVec(scale); } return memo; @@ -891,7 +899,7 @@ MemoryNormComponent::Memo* MemoryNormComponent::GetMemo( Memo *memo = new Memo; int32 num_frames = in.NumRows(); memo->num_frames = num_frames; - memo->data.Resize(5, block_dim_); + memo->data.Resize(6, block_dim_); CuSubVector x_sum(memo->data, 0), x_sumsq(memo->data, 1); x_sum.AddRowSumMat(1.0, in, 0.0); @@ -905,6 +913,8 @@ MemoryNormComponent::Memo* MemoryNormComponent::GetMemo( // just copy over the scale. x_deriv and scale_deriv remain zero. memo->data.Row(2).CopyFromVec(data_.Row(4)); } + // get 'x_mean' + memo->data.Row(5).CopyFromVec(data_.Row(0)); } else { // We should only reach this point on when processing the first // minibatch of each training job. @@ -926,6 +936,11 @@ MemoryNormComponent::Memo* MemoryNormComponent::GetMemo( // At this point 'scale' is the variance plus epsilon. scale.ApplyPow(-0.5); // OK, now 'scale' is the actual scale: the inverse standard deviation. 
+ + // get 'x_mean' + CuSubVector x_mean(memo->data, 5); + x_mean.CopyFromVec(x_sum); + x_mean.Scale(1.0 / num_frames); } return memo; } diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index 68506174eb7..bc80421700a 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -403,7 +403,7 @@ class MemoryNormComponent: public Component { // The number of frames (after any reshaping; so in general it will // be the original NumRows() of the matrix, times dim_ / block_dim_). int32 num_frames; - // 'data' is of dimension 5 by block_dim_. + // 'data' is of dimension 6 by block_dim_. // Row 0, which we'll call 'x_sum', is the sum of the rows of the // input data. // Row 1, which we'll call 'x_sumsq', is the sum of the rows of the @@ -423,6 +423,7 @@ // instead of the possibly-updated values that might exist when // Backprop() is called. It's actually not clear whether this is // necessary. + // Row 5 ('x_mean') is a copy of the data mean the data was normalized with. CuMatrix data; // This is set to true if we have the 'indirect' terms in the derivative, diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index e76f7cae2a7..4d29ba3a070 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -4895,6 +4895,12 @@ void CompositeComponent::Init(const std::vector &components, max_rows_process_ = max_rows_process; for (size_t i = 0; i < components_.size(); i++) { + if (components_[i]->Type() == "MemoryNormComponent") { + // This is out of concerns about the fact that the stats accumulation + // is done in the backprop, not in the forward propagation. + KALDI_ERR << "MemoryNormComponent cannot currently exist inside " + "CompositeComponent"; + } // make sure all constituent components are simple. 
KALDI_ASSERT(components_[i]->Properties() & kSimpleComponent); if (i > 0) { diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 30cd47b3eb2..c606db034fb 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -87,8 +87,11 @@ void NnetTrainer::Train(const NnetExample &eg) { void NnetTrainer::TrainInternal(const NnetExample &eg, const NnetComputation &computation) { + // note: because we give the 1st arg (nnet_) as a pointer to the + // constructor of 'computer', it will use that copy of the nnet to + // store stats. This is mainly important for memory-norm. NnetComputer computer(config_.compute_config, computation, - *nnet_, delta_nnet_); + nnet_, delta_nnet_); // give the inputs to the computer object. computer.AcceptInputs(*nnet_, eg.io); computer.Run(); @@ -121,8 +124,11 @@ void NnetTrainer::TrainInternal(const NnetExample &eg, void NnetTrainer::TrainInternalBackstitch(const NnetExample &eg, const NnetComputation &computation, bool is_backstitch_step1) { + // note: because we give the 1st arg (nnet_) as a pointer to the + // constructor of 'computer', it will use that copy of the nnet to + // store stats. This is mainly important for memory-norm. NnetComputer computer(config_.compute_config, computation, - *nnet_, delta_nnet_); + nnet_, delta_nnet_); // give the inputs to the computer object. 
computer.AcceptInputs(*nnet_, eg.io); computer.Run(); From 8a839ef5340cc1e79dab038987778912a7a793e8 Mon Sep 17 00:00:00 2001 From: freewym Date: Tue, 12 Dec 2017 20:26:34 -0500 Subject: [PATCH 014/184] fix --- egs/wsj/s5/steps/info/chain_dir_info.pl | 3 + egs/wsj/s5/steps/info/nnet3_dir_info.pl | 3 + egs/wsj/s5/steps/libs/nnet3/train/common.py | 3 +- src/chainbin/nnet3-chain-combine.cc | 13 +++-- src/nnet3bin/nnet3-combine.cc | 63 ++++++++++----------- 5 files changed, 46 insertions(+), 39 deletions(-) diff --git a/egs/wsj/s5/steps/info/chain_dir_info.pl b/egs/wsj/s5/steps/info/chain_dir_info.pl index b0adb7e498c..d0fac5292c6 100755 --- a/egs/wsj/s5/steps/info/chain_dir_info.pl +++ b/egs/wsj/s5/steps/info/chain_dir_info.pl @@ -137,6 +137,9 @@ sub get_combine_info { if (m/Combining nnets, objective function changed from (\S+) to (\S+)/) { close(F); return sprintf(" combine=%.3f->%.3f", $1, $2); + } elsif (m/Combining (\S+) nnets, objective function changed from (\S+) to (\S+)/) { + close(F); + return sprintf(" combine=%.3f->%.3f (over %d)", $2, $3, $1); } } } diff --git a/egs/wsj/s5/steps/info/nnet3_dir_info.pl b/egs/wsj/s5/steps/info/nnet3_dir_info.pl index 06d07a63755..4b0e774a592 100755 --- a/egs/wsj/s5/steps/info/nnet3_dir_info.pl +++ b/egs/wsj/s5/steps/info/nnet3_dir_info.pl @@ -137,6 +137,9 @@ sub get_combine_info { if (m/Combining nnets, objective function changed from (\S+) to (\S+)/) { close(F); return sprintf(" combine=%.2f->%.2f", $1, $2); + } elsif (m/Combining (\S+) nnets, objective function changed from (\S+) to (\S+)/) { + close(F); + return sprintf(" combine=%.2f->%.2f (over %d)", $2, $3, $1); } } } diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 8168d2f94a6..7312cc09fae 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -873,7 +873,8 @@ def __init__(self, type=float, dest='combine_sum_to_one_penalty', default=0.0, help="""If > 0, 
activates 'soft' enforcement of the sum-to-one penalty in combination (may be helpful - if using dropout). E.g. 1.0e-03. It is deprecated.""") + if using dropout). E.g. 1.0e-03. This option is + deprecated and does nothing.""") self.parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', default=0.0, help="""Momentum used in update computation. diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index 80cf72e2da3..ac1b40d29db 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -170,9 +170,11 @@ int main(int argc, char *argv[]) { // first evaluates the objective using the last model. int32 best_num_to_combine = 1; - double best_objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, - egs, moving_average_nnet, chain_config, den_fst, &prob_computer); - KALDI_LOG << "objective function using the last model is " << best_objf; + double + init_objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, + egs, moving_average_nnet, chain_config, den_fst, &prob_computer), + best_objf = init_objf; + KALDI_LOG << "objective function using the last model is " << init_objf; int32 num_nnets = po.NumArgs() - 3; // then each time before we re-evaluate the objective function, we will add @@ -198,8 +200,9 @@ int main(int argc, char *argv[]) { } } } - KALDI_LOG << "Using the model averaged over last " << best_num_to_combine - << " models, objective function is " << best_objf; + KALDI_LOG << "Combining " << best_num_to_combine + << " nnets, objective function changed from " << init_objf + << " to " << best_objf; if (HasBatchnorm(nnet)) RecomputeStats(egs, chain_config, den_fst, &best_nnet); diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index a38eb3eeddd..220dd663e30 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ b/src/nnet3bin/nnet3-combine.cc @@ -90,7 +90,7 @@ int main(int argc, char *argv[]) { "being fed into this binary. 
So we are actually combining last n models.\n" "Inputs and outputs are 'raw' nnets.\n" "\n" - "Usage: nnet3-combine [options] ... \n" + "Usage: nnet3-combine [options] ... \n" "\n" "e.g.:\n" " nnet3-combine 1.1.raw 1.2.raw 1.3.raw ark:valid.egs 2.raw\n"; @@ -154,50 +154,47 @@ int main(int argc, char *argv[]) { // first evaluates the objective using the last model. int32 best_num_to_combine = 1; - double best_objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, - egs, moving_average_nnet, &prob_computer); - KALDI_LOG << "objective function using the last model is " << best_objf; + double + init_objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, + egs, moving_average_nnet, &prob_computer), + best_objf = init_objf; + KALDI_LOG << "objective function using the last model is " << init_objf; int32 num_nnets = po.NumArgs() - 2; // then each time before we re-evaluate the objective function, we will add // num_to_add models to the moving average. int32 num_to_add = (num_nnets + max_objective_evaluations - 1) / max_objective_evaluations; - if (num_nnets > 1) { - for (int32 n = 1; n < num_nnets; n++) { - ReadKaldiObject(po.GetArg(1 + n), &nnet); - // updates the moving average - UpdateNnetMovingAverage(n + 1, nnet, &moving_average_nnet); - // evaluates the objective everytime after adding num_to_add model or - // all the models to the moving average. 
- if ((n - 1) % num_to_add == num_to_add - 1 || n == num_nnets - 1) { - double objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, - egs, moving_average_nnet, &prob_computer); - KALDI_LOG << "Combining last " << n + 1 - << " models, objective function is " << objf; - if (objf > best_objf) { - best_objf = objf; - best_nnet = moving_average_nnet; - best_num_to_combine = n + 1; - } + for (int32 n = 1; n < num_nnets; n++) { + ReadKaldiObject(po.GetArg(1 + n), &nnet); + // updates the moving average + UpdateNnetMovingAverage(n + 1, nnet, &moving_average_nnet); + // evaluates the objective every time after adding num_to_add models or + // all the models to the moving average. + if ((n - 1) % num_to_add == num_to_add - 1 || n == num_nnets - 1) { + double objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, + egs, moving_average_nnet, &prob_computer); + KALDI_LOG << "Combining last " << n + 1 + << " models, objective function is " << objf; + if (objf > best_objf) { + best_objf = objf; + best_nnet = moving_average_nnet; + best_num_to_combine = n + 1; } } - KALDI_LOG << "Using the model averaged over last " << best_num_to_combine - << " models, objective function is " << best_objf; + } + KALDI_LOG << "Combining " << best_num_to_combine + << " nnets, objective function changed from " << init_objf + << " to " << best_objf; + + if (HasBatchnorm(nnet)) + RecomputeStats(egs, &best_nnet); #if HAVE_CUDA==1 CuDevice::Instantiate().PrintProfile(); #endif - if (HasBatchnorm(nnet)) - RecomputeStats(egs, &best_nnet); - WriteKaldiObject(best_nnet, nnet_wxfilename, binary_write); - } else { - KALDI_LOG << "Copying the single input model directly to the output, " - << "without any combination."; - if (HasBatchnorm(nnet)) - RecomputeStats(egs, &nnet); - WriteKaldiObject(nnet, nnet_wxfilename, binary_write); - } + + WriteKaldiObject(best_nnet, nnet_wxfilename, binary_write); KALDI_LOG << "Finished combining neural nets, wrote model to " << nnet_wxfilename; } catch(const 
std::exception &e) { From 4e8a53ae5e21ddd7d7a9a51bf936ac80170e32c1 Mon Sep 17 00:00:00 2001 From: freewym Date: Tue, 12 Dec 2017 21:54:31 -0500 Subject: [PATCH 015/184] docs fixes --- egs/wsj/s5/steps/libs/nnet3/train/common.py | 5 +---- src/chainbin/nnet3-chain-combine.cc | 21 +++++++++---------- src/nnet3bin/nnet3-combine.cc | 23 +++++++++------------ 3 files changed, 21 insertions(+), 28 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 7312cc09fae..2b4fdd92cec 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -871,10 +871,7 @@ def __init__(self, last-numbered model as the final.mdl).""") self.parser.add_argument("--trainer.optimization.combine-sum-to-one-penalty", type=float, dest='combine_sum_to_one_penalty', default=0.0, - help="""If > 0, activates 'soft' enforcement of the - sum-to-one penalty in combination (may be helpful - if using dropout). E.g. 1.0e-03. This option is - deprecated and does nothing.""") + help="""This option is deprecated and does nothing.""") self.parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', default=0.0, help="""Momentum used in update computation. diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index ac1b40d29db..7cc341de028 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -28,26 +28,25 @@ namespace kaldi { namespace nnet3 { -// Computes the objective of the moving average of nnet on egs. If either of -// batchnorm/dropout test modes is true, we make a copy of the moving average, -// set test modes on that and evaluate its objective. Note: the object that -// prob_computer->nnet_ refers to should be moving_average_nnet. +// Computes the objective function for the examples in 'egs' given the model in +// 'nnet'. 
If either of batchnorm/dropout test modes is true, we make a copy of +// 'nnet', set test modes on that and evaluate its objective. +// Note: the object that prob_computer->nnet_ refers to should be 'nnet'. double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, - const std::vector &egs, - const Nnet &moving_average_nnet, + const std::vector &egs, const Nnet &nnet, const chain::ChainTrainingOptions &chain_config, const fst::StdVectorFst &den_fst, NnetChainComputeProb *prob_computer) { if (batchnorm_test_mode || dropout_test_mode) { - Nnet moving_average_nnet_copy(moving_average_nnet); + Nnet nnet_copy(nnet); if (batchnorm_test_mode) - SetBatchnormTestMode(true, &moving_average_nnet_copy); + SetBatchnormTestMode(true, &nnet_copy); if (dropout_test_mode) - SetDropoutTestMode(true, &moving_average_nnet_copy); + SetDropoutTestMode(true, &nnet_copy); NnetComputeProbOptions compute_prob_opts; NnetChainComputeProb prob_computer_test(compute_prob_opts, chain_config, - den_fst, moving_average_nnet_copy); - return ComputeObjf(false, false, egs, moving_average_nnet_copy, + den_fst, nnet_copy); + return ComputeObjf(false, false, egs, nnet_copy, chain_config, den_fst, &prob_computer_test); } else { prob_computer->Reset(); diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index 220dd663e30..a2eb61d7905 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ b/src/nnet3bin/nnet3-combine.cc @@ -28,25 +28,22 @@ namespace kaldi { namespace nnet3 { -// Computes the objective of the moving average of nnet on egs. If either of -// batchnorm/dropout test modes is true, we make a copy of the moving average, -// set test modes on that and evaluate its objective. Note: the object that -// prob_computer->nnet_ refers to should be moving_average_nnet. +// Computes the objective function for the examples in 'egs' given the model in +// 'nnet'. 
If either of batchnorm/dropout test modes is true, we make a copy of +// 'nnet', set test modes on that and evaluate its objective. +// Note: the object that prob_computer->nnet_ refers to should be 'nnet'. double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, - const std::vector &egs, - const Nnet &moving_average_nnet, + const std::vector &egs, const Nnet &nnet, NnetComputeProb *prob_computer) { if (batchnorm_test_mode || dropout_test_mode) { - Nnet moving_average_nnet_copy(moving_average_nnet); + Nnet nnet_copy(nnet); if (batchnorm_test_mode) - SetBatchnormTestMode(true, &moving_average_nnet_copy); + SetBatchnormTestMode(true, &nnet_copy); if (dropout_test_mode) - SetDropoutTestMode(true, &moving_average_nnet_copy); + SetDropoutTestMode(true, &nnet_copy); NnetComputeProbOptions compute_prob_opts; - NnetComputeProb prob_computer_test(compute_prob_opts, - moving_average_nnet_copy); - return ComputeObjf(false, false, egs, moving_average_nnet_copy, - &prob_computer_test); + NnetComputeProb prob_computer_test(compute_prob_opts, nnet_copy); + return ComputeObjf(false, false, egs, nnet_copy, &prob_computer_test); } else { prob_computer->Reset(); std::vector::const_iterator iter = egs.begin(), From 1fa9ae85c4bb7270f44c105d6dcff5f9e7fcbf6b Mon Sep 17 00:00:00 2001 From: freewym Date: Tue, 12 Dec 2017 21:57:40 -0500 Subject: [PATCH 016/184] fix --- src/chainbin/nnet3-chain-combine.cc | 6 +++--- src/nnet3bin/nnet3-combine.cc | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index 7cc341de028..ca0428553c1 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -28,9 +28,9 @@ namespace kaldi { namespace nnet3 { -// Computes the objective function for the examples in 'egs' given the model in -// 'nnet'. 
If either of batchnorm/dropout test modes is true, we make a copy of -// 'nnet', set test modes on that and evaluate its objective. +// Computes and returns the objective function for the examples in 'egs' given +// the model in 'nnet'. If either of batchnorm/dropout test modes is true, we +// make a copy of 'nnet', set test modes on that and evaluate its objective. // Note: the object that prob_computer->nnet_ refers to should be 'nnet'. double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, const std::vector &egs, const Nnet &nnet, diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index a2eb61d7905..4bcf4cdfb6d 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ b/src/nnet3bin/nnet3-combine.cc @@ -28,9 +28,9 @@ namespace kaldi { namespace nnet3 { -// Computes the objective function for the examples in 'egs' given the model in -// 'nnet'. If either of batchnorm/dropout test modes is true, we make a copy of -// 'nnet', set test modes on that and evaluate its objective. +// Computes and returns the objective function for the examples in 'egs' given +// the model in 'nnet'. If either of batchnorm/dropout test modes is true, we +// make a copy of 'nnet', set test modes on that and evaluate its objective. // Note: the object that prob_computer->nnet_ refers to should be 'nnet'. 
double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, const std::vector &egs, const Nnet &nnet, From 126614ba43c6d2c483e7626d109fbb0d58bb44fd Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 12 Dec 2017 23:00:20 -0500 Subject: [PATCH 017/184] [src] Changes to how MemoryNormComponent behaves (use more up-to-date stats) --- src/nnet3/nnet-normalize-component.cc | 136 +++++++++++--------------- src/nnet3/nnet-normalize-component.h | 54 +++++----- 2 files changed, 82 insertions(+), 108 deletions(-) diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index 596ab48b9e5..3e72ca1a152 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -869,14 +869,6 @@ void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, Memo *memo = NULL; if (!test_mode_) { memo = GetMemo(in); - if (false) { // temporary. - MemoryNormComponent *temp = new MemoryNormComponent(*this); - temp->StoreStats(in, *out, memo); - Memo *new_memo = temp->GetMemo(in); - delete memo; - memo = new_memo; - delete temp; - } } if (test_mode_) { @@ -884,8 +876,8 @@ void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, out->AddVecToRows(-1.0, x_mean); out->MulColsVec(scale); } else { - CuSubVector x_mean(memo->data, 5), - scale(memo->data, 2); + CuSubVector x_mean(memo->data, 0), + scale(memo->data, 4); out->AddVecToRows(-1.0, x_mean); out->MulColsVec(scale); } @@ -895,52 +887,41 @@ void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, MemoryNormComponent::Memo* MemoryNormComponent::GetMemo( const CuMatrixBase &in) const { - KALDI_ASSERT(in.NumCols() == block_dim_ && !test_mode_); + KALDI_ASSERT(in.NumCols() == block_dim_ && !test_mode_ && + stats_count_ >= 0.0); Memo *memo = new Memo; - int32 num_frames = in.NumRows(); - memo->num_frames = num_frames; - memo->data.Resize(6, block_dim_); - CuSubVector x_sum(memo->data, 0), - 
x_sumsq(memo->data, 1); - x_sum.AddRowSumMat(1.0, in, 0.0); - x_sumsq.AddDiagMat2(1.0, in, kTrans, 0.0); - if (stats_count_ > 0.0) { - memo->has_indirect_terms = include_indirect_derivative_; - if (include_indirect_derivative_) { - // copy over scale, x_deriv and scale_deriv. - memo->data.RowRange(2, 3).CopyFromMat(data_.RowRange(4, 3)); - } else { - // just copy over the scale. x_deriv and scale_deriv remain zero. - memo->data.Row(2).CopyFromVec(data_.Row(4)); - } - // get 'x_mean' - memo->data.Row(5).CopyFromVec(data_.Row(0)); - } else { - // We should only reach this point on when processing the first - // minibatch of each training job. - - // note: 'x_deriv' and 'scale_deriv' will be zero. This means we're - // ignoring the smaller, indirect term in the derivative for the first - // minibatch of each training job. That indirect term is really not that - // important that we should worry much about this. - memo->has_indirect_terms = false; - - CuSubVector scale(memo->data, 2); - scale.CopyFromVec(x_sumsq); - scale.AddVecVec(-1.0 / (num_frames * 1.0 * num_frames), - x_sum, x_sum, 1.0 / num_frames); - // At this point 'scale' is the variance. - // We apply the floor at 0.0 as a failsafe for problems caused by roundoff. - scale.ApplyFloor(0.0); - scale.Add(epsilon_); - // At this point 'scale' is the variance plus epsilon. - scale.ApplyPow(-0.5); - // OK, now 'scale' is the actual scale: the inverse standard deviation. + BaseFloat old_stats_count = stats_count_, + num_frames = in.NumRows(), + new_stats_count = num_frames + old_stats_count, + old_weight = old_stats_count / new_stats_count; - // get 'x_mean' - CuSubVector x_mean(memo->data, 5); - x_mean.CopyFromVec(x_sum); - x_mean.Scale(1.0 / num_frames); + // The information in 'memo' will be copied to *this when + // StoreStats() is caled (we can't update it in the Propagate() + // function for 'const' reasons). 
+ memo->stats_count = new_stats_count; + memo->backward_count = backward_count_; + memo->data = data_; + + CuSubVector x_mean(memo->data, 0), + x_uvar(memo->data, 1), scale(memo->data, 4); + // Each row of 'in' gets a weight of 1.0 / new_stats_count in the stats. + x_mean.AddRowSumMat(1.0 / new_stats_count, in, old_weight); + x_uvar.AddDiagMat2(1.0 / new_stats_count, in, kTrans, old_weight); + + scale.CopyFromVec(x_uvar); + scale.AddVecVec(-1.0, x_mean, x_mean, 1.0); + // at this point, 'scale' is the variance. + scale.ApplyFloor(0.0); + scale.Add(epsilon_); + scale.ApplyPow(-0.5); + // OK, now 'scale' is the scale. + + if (backward_count_ != 0.0) { + // we have stats 'y_deriv' and 'y_deriv_y' and we need to update the + // quantities x_deriv = y_deriv * scale, and scale_deriv = y_deriv_y * + // scale. + memo->data.RowRange(5, 2).AddMatDiagVec( + 1.0, memo->data.RowRange(2, 2), kNoTrans, scale, 0.0); } return memo; } @@ -993,6 +974,7 @@ void MemoryNormComponent::Backprop( // have the backprop called if the in_deriv is non-NULL. if (test_mode_) { + // In test mode we treat it as a fixed scale and offset. KALDI_ASSERT(memo_in == NULL && stats_count_ != 0.0); // the following is a no-op if in_deriv and out_deriv are the same matrix. in_deriv->CopyFromMat(out_deriv); @@ -1041,11 +1023,12 @@ void MemoryNormComponent::Backprop( in_deriv->CopyFromMat(out_deriv); Memo *memo = static_cast(memo_in); - CuSubVector scale(memo->data, 2); + CuSubVector scale(memo->data, 4); in_deriv->MulColsVec(scale); - if (memo->has_indirect_terms) { - CuSubVector x_deriv(memo->data, 3), - scale_deriv(memo->data, 4); + + if (memo->backward_count != 0.0) { + CuSubVector x_deriv(memo->data, 5), + scale_deriv(memo->data, 6); in_deriv->AddVecToRows(-1.0, x_deriv); in_deriv->AddMatDiagVec(-1.0, out_value, kNoTrans, scale_deriv); } @@ -1090,26 +1073,23 @@ void MemoryNormComponent::StoreStats( // required statistics are already stored in 'memo_in'. 
Memo *memo = static_cast(memo_in); - BaseFloat num_frames = memo->num_frames, - old_stats_count = stats_count_, - new_stats_count = num_frames + old_stats_count, - old_weight = old_stats_count / new_stats_count; - - // x_mean_and_x_uvar is the first 2 rows of data_. - CuSubMatrix x_mean_and_x_uvar(data_, 0, 2, 0, block_dim_); - // x_sum_and_x_sumsq is the first 2 rows of data_. - CuSubMatrix x_sum_and_x_sumsq(memo->data, 0, 2, 0, block_dim_); - - x_mean_and_x_uvar.Scale(old_weight); - // The factor 1.0 / new_stats_count that appears below can be perhaps more - // clearly written as follows: first define - // new_weight = num_frames / new_stats_count - // and then write 'new_weight / num_frames', which simplifies to - // '1.0 / new_stats_count'. The factor of '1.0 / num_frames' - // is necessary to convert from data sums to a per-frame average. - x_mean_and_x_uvar.AddMat(1.0 / new_stats_count, x_sum_and_x_sumsq); - stats_count_ = new_stats_count; - ComputeDerived(); + // check that the memo's stats count is more than our stats_count_, + // which it should be because the memo should have added extra stats, + // and StoreStats() should be called directly after the Propagate() + // function. + // This could possibly fail with memo_in->stats_count == stats_count_ + // due to roundoff, if you trained with batchnorm-stats-scale set at 1, + // but that would be a poor choice of parameters anyway as + // roundoff would be a big problem. + KALDI_ASSERT(memo->stats_count > stats_count_); + + stats_count_ = memo->stats_count; + // Copying the entire data matrix should be safe because + // StoreStats() is always called directly after the corresponding + // Propagate(), and on the same object; and there should be + // no possibility that other things in this->data changed in + // the interim. 
+ data_.CopyFromMat(memo->data); } void MemoryNormComponent::Read(std::istream &is, bool binary) { diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index bc80421700a..7d6cafa098b 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -400,36 +400,30 @@ class MemoryNormComponent: public Component { private: struct Memo { - // The number of frames (after any reshaping; so in general it will - // be the original NumRows() of the matrix, times dim_ / block_dim_). - int32 num_frames; - // 'data' is of dimension 6 by block_dim_. - // Row 0, which we'll call 'x_sum', is the sum of the rows of the - // input data. - // Row 1, which we'll call 'x_sumsq', is the sum of the rows of the - // elementwise square of the input data matrix. - // Row 2,3,4 are 'scale', 'x_deriv', 'scale_deriv', which - // are just copies of the corresponding values in - // MemoryNormComponent::data_ (from the const nnet, the one we're - // training), and which will have been copied from there when this - // object was created. However if stats_count_ was <= 0 when this - // object was created (first minibatch), then 'scale' - // will be set to the mean and inverse-stddev implied by the stats - // 'sum' and 'sumsq', and 'x_deriv' and 'scale_deriv' will be zero. - // This is so that it does something sensible on the very first - // minibatch we train. The reason why we copy these quantities here - // is because in the backprop phase we feel it would be better to - // use the same values that were used in the forward propagation, - // instead of the possibly-updated values that might exist when - // Backprop() is called. It's actually not clear whether this is - // necessary. - // Row 5 ('x_mean') is a copy of the data mean the data wasnormalized with. 
+ // 'stats_count' is the same as stats_count_ in the MemoryNormComponent + // from whose Propagate() function this memo was generated, plus + // the number of frames we're propagating (this is after any reshaping + // if block_dim_ != dim_). + BaseFloat stats_count; + + // 'stats_count' is the same as stats_count_ in the MemoryNormComponent + // from whose Propagate() function this memo was generated. It's mainly + // included because the backprop code wants to see if this was nonzero. + BaseFloat backward_count; + + // The structure of 'data' is the same as the data_ member of + // MemoryNormComponent; it's a matrix of dimension 7 by block_dim_. + // It will differ from the data_ member of the component we generated this + // from by the addition of some extra data in the 'x_sum' and 'x_sumsq' + // stats, and a corresponding modification of the 'scale', 'x_deriv' + // and 'scale_deriv' quantities. + // + // (note: the reason we update the stats before propagation rather + // than after, is for stability: otherwise, with relu units, if we only + // update the stats after the propagation we get a particular pathology: if + // a unit was previously always zero it will get a big scale; and if then we + // start getting some nonzero output, the scale on it will be too large.) CuMatrix data; - - // This is set to true if we have the 'indirect' terms in the derivative, - // relating to the 'x_deriv' and 'scale_deriv' terms in 'data'. If false, - // we save some computation. - bool has_indirect_terms; }; @@ -503,7 +497,7 @@ class MemoryNormComponent: public Component { // We store data_ as a single matrix because it enables certain operations // to be done using fewer kernels, but it contains various different quantities, // which we'll describe below as if they were separate variables. - // data_ is of dimension 6 by block_dim_. + // data_ is of dimension 7 by block_dim_. 
CuMatrix data_; // data_.Row(0) is 'x_mean', which is the decaying moving-average of // input data x; or zero if stats_count_ is zero. From 81335d085f3e9c9b87aa71483e0ed3140446eaa3 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 13 Dec 2017 23:08:44 -0500 Subject: [PATCH 018/184] [src] Simplify NormalizeLayer. --- src/nnet3/nnet-normalize-component.cc | 97 +++++++++++++-------------- src/nnet3/nnet-normalize-component.h | 15 ++--- 2 files changed, 52 insertions(+), 60 deletions(-) diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index 3e72ca1a152..f305c196504 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -714,7 +714,7 @@ void BatchNormComponent::ZeroStats() { We can compute: mean = sum / count var = epsilon + (sumsq / count) - (mean * mean) - scale = var^{-0.5} + scale = target_rms * var^{-0.5} y(i) = (x(i) - mean) * scale. @@ -729,8 +729,11 @@ void BatchNormComponent::ZeroStats() { mean' = -scale * \sum_i w(i) y'(i) scale' = \sum_i w(i) y'(i) (x(i) - mean) = 1/scale \sum_i w(i) y'(i) y(i) - var' = -0.5 var^{-1.5} scale' - = -0.5 var^{-1} \sum_i w(i) y'(i) y(i) + var' = -0.5 target_rms var^{-1.5} scale' + = -0.5 target_rms var^{-1.5} (1/scale) \sum_i w(i) y'(i) y(i) + .. and using 1/scale = var^{0.5}/target_rms, + = -0.5 var^{-1} \sum_i w(i) y'(i) y(i) (*) + It will be convenient to write down 'per-frame' versions of all of these quantities, which are divided by the total count: @@ -752,16 +755,23 @@ void BatchNormComponent::ZeroStats() { x'(i) = y'(i)*scale + mean_norm' + 2 var_norm' (x(i) - mean) = y'(i)*scale + mean_norm' + 2 var_norm' y(i) / scale - = y'(i)*scale + mean_norm' - y(i) * scale/count * \sum_i w(i) y'(i) y(i) - - I'm afraid I just pulled the above out of thin air... needs some more - derivation. The part about (x(i) - mean) can be obtained, I believe, - from computation of the derivative of the variance w.r.t. the x(i) values. - + ... 
and substituting in the equation (*) above for var', using var_norm' = var'/scale, + and rearranging slightly: + = y'(i)*scale + mean_norm' - y(i) * var^{-1}/scale * 1/count * \sum_i w(i) y'(i) y(i) + .. and using scale=target-rms * var^{-0.5}, so var^{-1}/scale = var^{-0.5}/target-rms = scale/target-rms^2: + = y'(i)*scale + mean_norm' - y(i) * scale/(count*target-rms^2) * \sum_i w(i) y'(i) y(i) + .. and considering that the factor of 'scale' appears (directly or indirectly) in all 3 + of the terms in the above expression, we can reorganize this as: + = scale * (y'(i) - 1/count*\sum_i w(i)*y(i) - 1/(count*target-rms^2) * \sum_i w(i) y'(i) y(i)) */ void MemoryNormComponent::SetTestMode(bool test_mode) { + if (test_mode && stats_count_ <= 0) { + KALDI_WARN << "Refusing to set test-mode in MemoryNormComponent since no " + "stats are present."; + return; + } test_mode_ = test_mode; } @@ -795,16 +805,13 @@ std::string MemoryNormComponent::Info() const { if (stats_count_ > 0.0) { CuSubVector x_mean(data_, 0), y_deriv(data_, 2), y_deriv_y(data_, 3), - scale(data_, 4), x_deriv(data_, 5), - scale_deriv(data_, 6); + scale(data_, 4); if (stats_count_ > 0.0) stream << ", x-mean=" << SummarizeVector(x_mean) << ", scale=" << SummarizeVector(scale); if (backward_count_ > 0.0) stream << ", y-deriv=" << SummarizeVector(y_deriv) - << ", y-deriv-y=" << SummarizeVector(y_deriv_y) - << ", x-deriv=" << SummarizeVector(x_deriv) - << ", scale-deriv=" << SummarizeVector(scale_deriv); + << ", y-deriv-y=" << SummarizeVector(y_deriv_y); } return stream.str(); } @@ -836,7 +843,7 @@ void MemoryNormComponent::InitFromConfig(ConfigLine *cfl) { << cfl->UnusedValues(); stats_count_ = 0.0; backward_count_ = 0.0; - data_.Resize(7, block_dim_); + data_.Resize(5, block_dim_); } @@ -862,7 +869,7 @@ void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, out->CopyFromMat(in); if (test_mode_ && stats_count_ <= 0.0) - KALDI_ERR << "Test mode set but no stats available."; + 
KALDI_ERR << "Test mode set but no stats available."; // From this point, we can assume that the num-cols of 'in' and 'out' // equals block_dim_. @@ -909,20 +916,15 @@ MemoryNormComponent::Memo* MemoryNormComponent::GetMemo( x_uvar.AddDiagMat2(1.0 / new_stats_count, in, kTrans, old_weight); scale.CopyFromVec(x_uvar); - scale.AddVecVec(-1.0, x_mean, x_mean, 1.0); + // we save a CUDA operation by applying the scale 'target_rms_scale' before doing + // ApplyPow(-0.5), and this requires taking it to the power -2. + BaseFloat target_rms_scale = 1.0 / (target_rms_ * target_rms_); + scale.AddVecVec(-target_rms_scale, x_mean, x_mean, target_rms_scale); // at this point, 'scale' is the variance. scale.ApplyFloor(0.0); - scale.Add(epsilon_); + scale.Add(epsilon_ * target_rms_scale); scale.ApplyPow(-0.5); // OK, now 'scale' is the scale. - - if (backward_count_ != 0.0) { - // we have stats 'y_deriv' and 'y_deriv_y' and we need to update the - // quantities x_deriv = y_deriv * scale, and scale_deriv = y_deriv_y * - // scale. - memo->data.RowRange(5, 2).AddMatDiagVec( - 1.0, memo->data.RowRange(2, 2), kNoTrans, scale, 0.0); - } return memo; } @@ -1013,8 +1015,8 @@ void MemoryNormComponent::Backprop( to_update->backward_count_ = new_backward_count; // We don't bother calling to_update->ComputeDerived()-- although it would // be harmless-- because in the current situations where this code is - // reached, to_update will be the delta_nnet_, and the derived parameters of - // delta_nnet_ aren't used. + // reached, to_update will be the delta_nnet_, and the derived parameter + // 'scale') of delta_nnet_ aren't used. 
// to_update->ComputeDerived(); } @@ -1023,42 +1025,37 @@ void MemoryNormComponent::Backprop( in_deriv->CopyFromMat(out_deriv); Memo *memo = static_cast(memo_in); + if (memo->backward_count != 0.0) { + CuSubVector y_deriv(memo->data, 2), + y_deriv_y(memo->data, 3); + in_deriv->AddVecToRows(-1.0, y_deriv); + in_deriv->AddMatDiagVec(-1.0 / (target_rms_ * target_rms_), + out_value, kNoTrans, y_deriv_y); + } CuSubVector scale(memo->data, 4); in_deriv->MulColsVec(scale); - if (memo->backward_count != 0.0) { - CuSubVector x_deriv(memo->data, 5), - scale_deriv(memo->data, 6); - in_deriv->AddVecToRows(-1.0, x_deriv); - in_deriv->AddMatDiagVec(-1.0, out_value, kNoTrans, scale_deriv); - } } void MemoryNormComponent::ComputeDerived() { - KALDI_ASSERT(stats_count_ >= 0.0 && data_.NumRows() == 7); + KALDI_ASSERT(stats_count_ >= 0.0 && data_.NumRows() == 5); if (stats_count_ == 0.0) { - // zero 'scale', 'x_deriv' and 'scale_deriv'. - data_.RowRange(4, 3).SetZero(); + // zero 'scale'. + data_.Row(4).SetZero(); return; } CuSubVector x_mean(data_, 0), x_uvar(data_, 1), - y_deriv(data_, 2), y_deriv_y(data_, 3), scale(data_, 4); + scale(data_, 4); scale.CopyFromVec(x_uvar); - scale.AddVecVec(-1.0, x_mean, x_mean, 1.0); - // at this point, 'scale' is the variance. + // we save a CUDA operation by applying the scale 'target_rms_scale' before doing + // ApplyPow(-0.5), and this requires taking it to the power -2. + BaseFloat target_rms_scale = 1.0 / (target_rms_ * target_rms_); + scale.AddVecVec(-target_rms_scale, x_mean, x_mean, target_rms_scale); + // at this point, 'scale' is the variance (divided by target_rms^2). scale.ApplyFloor(0.0); - scale.Add(epsilon_); + scale.Add(epsilon_ * target_rms_scale); scale.ApplyPow(-0.5); - if (backward_count_ == 0.0) { - // The following statement sets x_deriv and scale_deriv to zero. - data_.RowRange(5, 2).SetZero(); - } else { - // The following statement sets x_deriv = y_deriv * scale, - // and scale_deriv = y_deriv_y * scale. 
- data_.RowRange(5, 2).AddMatDiagVec(1.0, - data_.RowRange(2, 2), kNoTrans, scale, 0.0); - } } void MemoryNormComponent::StoreStats( diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index 7d6cafa098b..e32ad549c28 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -164,6 +164,8 @@ class BatchNormComponent: public Component { // accumulate these stats; they are stored as a matter of course on each // iteration of training, as for NonlinearComponents, and we'll use the stats // from the most recent [script-level] iteration. + // (Note: it will refuse to actually set test-mode to true if there + // are no stats stored.) void SetTestMode(bool test_mode); // constructor using another component @@ -412,7 +414,7 @@ class MemoryNormComponent: public Component { BaseFloat backward_count; // The structure of 'data' is the same as the data_ member of - // MemoryNormComponent; it's a matrix of dimension 7 by block_dim_. + // MemoryNormComponent; it's a matrix of dimension 5 by block_dim_. // It will differ from the data_ member of the component we generated this // from by the addition of some extra data in the 'x_sum' and 'x_sumsq' // stats, and a corresponding modification of the 'scale', 'x_deriv' @@ -497,7 +499,7 @@ class MemoryNormComponent: public Component { // We store data_ as a single matrix because it enables certain operations // to be done using fewer kernels, but it contains various different quantities, // which we'll describe below as if they were separate variables. - // data_ is of dimension 7 by block_dim_. + // data_ is of dimension 5 by block_dim_. CuMatrix data_; // data_.Row(0) is 'x_mean', which is the decaying moving-average of // input data x; or zero if stats_count_ is zero. @@ -511,18 +513,11 @@ class MemoryNormComponent: public Component { // objective w.r.t. the output); or zero if backward_count_ // is zero. 
// - // The quantities below are derived from the stats above. + // The quantity below is derived from the stats above. // // data_.Row(4) is 'scale', which is the inverse square root of the // covariance computed from x_mean and x_uvar (plus epsilon), // or zero if stats_count_ is zero. - // data_.Row(5) is 'x_deriv', which is the negative of the average derivative - // (per frame) of the objective function w.r.t the input x (just the - // part that comes via the derivative w.r.t. the x mean). - // 'x_deriv' equals 'y_deriv' times 'scale'. - // data_.Row(6) is 'scale_deriv', which relates to the part of the - // derivative w.r.t. the input that comes from the objf - // derivative w.r.t. the scale. It equals scale * y_deriv_y. }; From aefb72eb8cd8664415d08dad7313d0d4170446ab Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 14 Dec 2017 22:12:38 -0500 Subject: [PATCH 019/184] [src] Simplify MemoryNormComponent and always store stats during backprop on nnet_to_store_stats_ --- src/nnet3/nnet-compute.cc | 20 +++++++++++++++----- src/nnet3/nnet-normalize-component.cc | 27 ++++++++++++++------------- src/nnet3/nnet-normalize-component.h | 4 ++-- src/nnet3/nnet-test-utils.cc | 10 ---------- 4 files changed, 31 insertions(+), 30 deletions(-) diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 31d60e1ce4a..12a4ec65ae9 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -261,11 +261,21 @@ void NnetComputer::ExecuteCommand() { debug_str << nnet_.GetComponentName(c.arg1); const Component *component = nnet_.GetComponent(c.arg1); KALDI_ASSERT(!(computation_.need_model_derivative && !nnet_to_update_)); - Component *upd_component = (nnet_to_update_ && - c.command_type == kBackprop && - computation_.need_model_derivative ? 
- nnet_to_update_->GetComponent(c.arg1) : - NULL); + Component *upd_component = NULL; + if (c.command_type == kBackprop) { // this block sets 'upd_component' + Nnet *nnet_to_update; + if (component->Properties()&kUpdatableComponent) { + nnet_to_update = (computation_.need_model_derivative ? + nnet_to_update_ : NULL); + } else { + // Some non-updatable components, such as CompositeComponent and + // MemoryNormComponent, store stats in the backprop. For other + // types of component, this arg won't matter. + nnet_to_update = nnet_to_store_stats_; + } + if (nnet_to_update) + upd_component = nnet_to_update->GetComponent(c.arg1); + } ComponentPrecomputedIndexes *indexes = computation_.component_precomputed_indexes[c.arg2].data; const CuSubMatrix in_value(GetSubMatrix(c.arg3)); diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index f305c196504..1e3314bf91f 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -903,7 +903,7 @@ MemoryNormComponent::Memo* MemoryNormComponent::GetMemo( old_weight = old_stats_count / new_stats_count; // The information in 'memo' will be copied to *this when - // StoreStats() is caled (we can't update it in the Propagate() + // StoreStats() is called (we can't update it in the Propagate() // function for 'const' reasons). memo->stats_count = new_stats_count; memo->backward_count = backward_count_; @@ -1013,28 +1013,25 @@ void MemoryNormComponent::Backprop( y_deriv_y.AddDiagMatMat(1.0 / new_backward_count, out_deriv, kTrans, out_value, kNoTrans, old_weight); to_update->backward_count_ = new_backward_count; - // We don't bother calling to_update->ComputeDerived()-- although it would - // be harmless-- because in the current situations where this code is - // reached, to_update will be the delta_nnet_, and the derived parameter - // 'scale') of delta_nnet_ aren't used. 
- // to_update->ComputeDerived(); + // Now 'to_update' will typically be the same as 'this', so we need + // to compute the derived parameters because it affects some code that's + // below. + to_update->ComputeDerived(); } // the following does no work if in_deriv and out_deriv are the same matrix. in_deriv->CopyFromMat(out_deriv); - Memo *memo = static_cast(memo_in); - if (memo->backward_count != 0.0) { - CuSubVector y_deriv(memo->data, 2), - y_deriv_y(memo->data, 3); + if (this->backward_count_ != 0.0) { + CuSubVector y_deriv(data_, 2), + y_deriv_y(data_, 3); in_deriv->AddVecToRows(-1.0, y_deriv); in_deriv->AddMatDiagVec(-1.0 / (target_rms_ * target_rms_), out_value, kNoTrans, y_deriv_y); } - CuSubVector scale(memo->data, 4); + CuSubVector scale(data_, 4); in_deriv->MulColsVec(scale); - } @@ -1084,7 +1081,7 @@ void MemoryNormComponent::StoreStats( // Copying the entire data matrix should be safe because // StoreStats() is always called directly after the corresponding // Propagate(), and on the same object; and there should be - // no possibility that other things in this->data changed in + // no possibility that other things in this->data_ changed in // the interim. 
data_.CopyFromMat(memo->data); } @@ -1169,6 +1166,10 @@ void MemoryNormComponent::Add(BaseFloat alpha, const Component &other_in) { return; } + if (alpha * other->stats_count_ == 0.0 && + alpha * other->backward_count_ == 0.0) + return; + BaseFloat new_stats_count = stats_count_ + alpha * other->stats_count_, new_backward_count = backward_count_ + alpha * other->backward_count_; diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index e32ad549c28..5299862ee65 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -296,11 +296,11 @@ class BatchNormComponent: public Component { /* - MemoryNormComponent - MemoryNormComponent is like batch normalization, except the stats are accumulated as a weighted sum over past minibatches (if this is not the first minibatch), instead of over the current minibatch. + Caution: we don't test this component in the standard way because it + would fail the derivative tests. You can use it in the same way you would normally use BatchNormComponent. diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 6ed0b6f9191..83b902a9b90 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1711,16 +1711,6 @@ static void GenerateRandomComponentConfig(std::string *component_type, << " learning-rate=" << learning_rate; break; } - /* case 35: { - *component_type = "MemoryNormComponent"; - int32 block_dim = RandInt(1, 10), dim = block_dim * RandInt(1, 2); - os << " dim=" << dim - << " block-dim=" << block_dim << " target-rms=" - << RandInt(1, 2) << " include-indirect-derivative=" - << (RandInt(0, 1) == 0 ? "true" : "false") - << " epsilon=" << (RandInt(0, 1) == 0 ? 
"0.1" : "1.0"); - break; - }*/ default: KALDI_ERR << "Error generating random component"; } From 8fae6ae7319003b942cfe7b316aa48d03c9eeb86 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 15 Dec 2017 02:05:54 -0500 Subject: [PATCH 020/184] [scripts] Script fix; update batchnorm(/memnorm) stats faster in first part of training. --- .../steps/libs/nnet3/train/chain_objf/acoustic_model.py | 9 ++++++++- egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 5b640510ea1..78993bca217 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -173,13 +173,19 @@ def train_new_models(dir, iter, srand, num_jobs, (" --write-cache={0}/cache.{1}".format(dir, iter + 1) if job == 1 else "")) + # For the first epoch (at most the first 15 iters), scale the batchnorm stats + # down more aggressively. This affects memory-norm components. 
+ batchnorm_opt=("--batchnorm-stats-scale=0.5" + if num_archives_processed < (num_archives * frame_subsampling_factor) and iter < 15 + else "") + thread = common_lib.background_command( """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ nnet3-chain-train {parallel_train_opts} {verbose_opt} \ --apply-deriv-weights={app_deriv_wts} \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ {cache_io_opts} --xent-regularize={xent_reg} \ - {deriv_time_opts} \ + {deriv_time_opts} {batchnorm_opt} \ --print-interval=10 --momentum={momentum} \ --max-param-change={max_param_change} \ --backstitch-training-scale={backstitch_training_scale} \ @@ -199,6 +205,7 @@ def train_new_models(dir, iter, srand, num_jobs, dir=dir, iter=iter, srand=iter + srand, next_iter=iter + 1, job=job, deriv_time_opts=" ".join(deriv_time_opts), + batchnorm_opt=batchnorm_opt, app_deriv_wts=apply_deriv_weights, fr_shft=frame_shift, l2=l2_regularize, xent_reg=xent_regularize, leaky=leaky_hmm_coefficient, diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 483883fdee4..c8a71e15672 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -805,7 +805,7 @@ def _add_components(self, input_desc, input_dim, nonlinearities): elif nonlinearity == 'memnorm': line = ('component name={0}.{1}' ' type=MemoryNormComponent dim={2}' - ' target-rms={3} include-indirect-derivative=false ' + ' target-rms={3} ' ''.format(self.name, nonlinearity, output_dim, target_rms)) From 6148dccd9f5a9ac31cebbd5a9cf6c7bf6346861a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 15 Dec 2017 17:42:19 -0500 Subject: [PATCH 021/184] [src,scripts] Add extra nnet3 diagnostics; add reothonormalize option for LinearComponent --- .../nnet3/train/chain_objf/acoustic_model.py | 26 ++++++- .../nnet3/train/frame_level_objf/common.py | 23 ++++++ egs/wsj/s5/steps/make_phone_graph.sh | 1 + 
egs/wsj/s5/utils/dict_dir_add_pronprobs.sh | 18 ++++- .../s5/utils/lang/make_phone_bigram_lang.sh | 4 +- src/nnet3/nnet-chain-training.cc | 19 +++++ src/nnet3/nnet-convolutional-component.cc | 38 +++++----- src/nnet3/nnet-parse.cc | 27 ++++++- src/nnet3/nnet-parse.h | 23 +++++- src/nnet3/nnet-simple-component.cc | 47 +++++++++--- src/nnet3/nnet-simple-component.h | 30 ++++++-- src/nnet3/nnet-training.cc | 19 +++++ src/nnet3/nnet-utils.cc | 73 +++++++++++++++++++ src/nnet3/nnet-utils.h | 12 ++- src/nnet3bin/nnet3-show-progress.cc | 13 +++- 15 files changed, 315 insertions(+), 58 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 02a3b4c75d5..9b424a5e384 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -480,14 +480,34 @@ def compute_progress(dir, iter, run_opts): common_lib.background_command( """{command} {dir}/log/progress.{iter}.log \ nnet3-am-info {model} '&&' \ - nnet3-show-progress --use-gpu=no \ - "nnet3-am-copy --raw=true {prev_model} - |" \ - "nnet3-am-copy --raw=true {model} - |" + nnet3-show-progress --use-gpu=no {prev_model} {model} """.format(command=run_opts.command, dir=dir, iter=iter, model=model, prev_model=prev_model)) + if iter % 10 == 0 and iter > 0: + # Every 10 iters, print some more detailed information. + # full_progress.X.log contains some diagnostics of the difference in + # parameters, printed in the same format as from nnet3-info. 
+ common_lib.background_command( + """{command} {dir}/log/full_progress.{iter}.log \ + nnet3-show-progress --use-gpu=no --verbose=2 {prev_model} {model} + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model, + prev_model=prev_model)) + # full_info.X.log is just the nnet3-info of the model, with the --verbose=2 + # option which includes stats on the singular values of the parameter matrices. + common_lib.background_command( + """{command} {dir}/log/full_info.{iter}.log \ + nnet3-info --verbose=2 {model} + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model)) + def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, egs_dir, leaky_hmm_coefficient, l2_regularize, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index f8a69c5ad84..9c09394ccb4 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -447,6 +447,29 @@ def compute_progress(dir, iter, egs_dir, ''.format(command=run_opts.command, dir=dir, iter=iter, model=model, prev_model=prev_model)) + if iter % 10 == 0 and iter > 0: + # Every 10 iters, print some more detailed information. + # full_progress.X.log contains some diagnostics of the difference in + # parameters, printed in the same format as from nnet3-info. + common_lib.background_command( + """{command} {dir}/log/full_progress.{iter}.log \ + nnet3-show-progress --use-gpu=no --verbose=2 {prev_model} {model} + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model, + prev_model=prev_model)) + # full_info.X.log is just the nnet3-info of the model, with the --verbose=2 + # option which includes stats on the singular values of the parameter matrices. 
+ common_lib.background_command( + """{command} {dir}/log/full_info.{iter}.log \ + nnet3-info --verbose=2 {model} + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model)) + + def combine_models(dir, num_iters, models_to_combine, egs_dir, minibatch_size_str, diff --git a/egs/wsj/s5/steps/make_phone_graph.sh b/egs/wsj/s5/steps/make_phone_graph.sh index 817f7d1f10b..aaf88cc66d2 100755 --- a/egs/wsj/s5/steps/make_phone_graph.sh +++ b/egs/wsj/s5/steps/make_phone_graph.sh @@ -8,6 +8,7 @@ # is to be used for segmentation, and uses that together with a model to # make a decoding graph. # Uses SRILM. +# See also utils/lang/make_phone_bigram_lm.sh. # Begin configuration section. stage=0 diff --git a/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh b/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh index 50191cf90cb..59ae4a4c994 100755 --- a/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh +++ b/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh @@ -6,6 +6,11 @@ # 2015 Hainan Xu +# The thing that this script implements is described in the paper: +# "PRONUNCIATION AND SILENCE PROBABILITY MODELING FOR ASR" +# by Guoguo Chen et al, see +# http://www.danielpovey.com/files/2015_interspeech_silprob.pdf + . ./path.sh || exit 1; # begin configuration @@ -73,7 +78,7 @@ fi # the cat and awk commands below are implementing add-one smoothing. cat <(awk '{print 1, $0;}' <$dir/lexicon.txt) $pron_counts | \ awk '{ count = $1; $1 = ""; word_count[$2] += count; pron_count[$0] += count; pron2word[$0] = $2; } - END{ for (p in pron_count) { word = pron2word[p]; num = pron_count[p]; den = word_count[word]; + END{ for (p in pron_count) { word = pron2word[p]; num = pron_count[p]; den = word_count[word]; print num / den, p } } ' | \ awk '{ word = $2; $2 = $1; $1 = word; print; }' | grep -v '^' |\ sort -k1,1 -k2g,2 -k3 > $dir/lexiconp.txt @@ -108,6 +113,11 @@ fi # Create $dir/lexiconp_silprob.txt and $dir/silprob.txt if silence counts file # exists. 
The format of $dir/lexiconp_silprob.txt is: # word pron-prob P(s_r | w) F(s_l | w) F(n_l | w) pron +# where: P(s_r | w) is the probability of silence to the right of the word +# F(s_l | w) is a factor which is greater than one if silence to the +# left of the word is more than averagely probable. +# F(n_l | w) is a factor which is greater than one if nonsilence to the +# left of the word is more than averagely probable. if [ -n "$sil_counts" ]; then if [ ! -s "$sil_counts" ]; then echo "$0: expected file $sil_counts to exist and not empty" && exit 1; @@ -175,7 +185,7 @@ if [ -n "$sil_counts" ]; then # Computes F(s_l | w) and F(n_l | w) in the paper. $lambda3 = 2; # Smoothing term, \lambda_3 in the paper. foreach my $wpron (keys %all_wprons) { - @col = split(" ", $wpron); + @col = split(" ", $wpron); $word = shift @col; $pron = join(" ", @col); $pron_prob = $all_wprons{$wpron}; @@ -189,7 +199,7 @@ if [ -n "$sil_counts" ]; then print LPSP "$word $pron_prob $P_w_sr{$wpron} $F_sl_w $F_nl_w $pron\n"; } - + # Create silprob.txt $BOS_sil_count = $wpron_sil{""} + $sil_prob * $lambda2; $BOS_nonsil_count = $wpron_nonsil{""} + (1 - $sil_prob) * $lambda2; @@ -206,7 +216,7 @@ if [ -n "$sil_counts" ]; then fi # now regenerate lexicon.txt from lexiconp.txt, to make sure the lines are -# in the same order. +# in the same order. cat $dir/lexiconp.txt | awk '{$2 = ""; print;}' | sed 's/ / /g' >$dir/lexicon.txt diff --git a/egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh b/egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh index dcb77bb1342..1d3d04896b4 100755 --- a/egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh +++ b/egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh @@ -9,10 +9,10 @@ # is to limit the number of transitions, so we can decode reasonably fast, and the # graph won't blow up. This is probably going to be most useful for things like # language-id. +# +# See also steps/make_phone_graph.sh -# We might later have options here; if not, I'll emove this. 
- echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 780a7115a8a..b24e81e7494 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -119,6 +119,10 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, // happens when we use the model with batchnorm test-mode set). ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + // The following will only do something if we have a LinearComponent + // with is-constrained-orthonormal set to true. + ConstrainOrthonormal(nnet_); + // Scale delta_nnet if (success) ScaleNnet(nnet_config.momentum, delta_nnet_); @@ -167,6 +171,21 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, nnet_config.max_param_change, max_change_scale, scale_adding, nnet_, &num_max_change_per_component_applied_, &num_max_change_global_applied_); + if (is_backstitch_step1) { + // The following will only do something if we have a LinearComponent + // with is-constrained-orthonormal set to true. We choose to do this + // only on the 1st backstitch step, for efficiency. + ConstrainOrthonormal(nnet_); + } + + if (!is_backstitch_step1) { + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when we use the model with batchnorm test-mode set). Do this + // after backstitch step 2 so that the stats are scaled down before we start + // the next minibatch. 
+ ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + } + ScaleNnet(0.0, delta_nnet_); } diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc index f689984e876..333d7a79cfa 100644 --- a/src/nnet3/nnet-convolutional-component.cc +++ b/src/nnet3/nnet-convolutional-component.cc @@ -263,18 +263,14 @@ void TimeHeightConvolutionComponent::InitFromConfig(ConfigLine *cfl) { cfl->GetValue("alpha-out", &alpha_out); cfl->GetValue("num-minibatches-history", &num_minibatches_history); - preconditioner_in_.SetAlpha(alpha_in); - preconditioner_out_.SetAlpha(alpha_out); int32 dim_in = linear_params_.NumCols() + 1, dim_out = linear_params_.NumRows(); - if (rank_in < 0) { + if (rank_in < 0) rank_in = std::min(80, (dim_in + 1) / 2); - preconditioner_in_.SetRank(rank_in); - } - if (rank_out < 0) { + preconditioner_in_.SetRank(rank_in); + if (rank_out < 0) rank_out = std::min(80, (dim_out + 1) / 2); - preconditioner_out_.SetRank(rank_out); - } + preconditioner_out_.SetRank(rank_out); preconditioner_in_.SetNumMinibatchesHistory(num_minibatches_history); preconditioner_out_.SetNumMinibatchesHistory(num_minibatches_history); @@ -360,29 +356,29 @@ void TimeHeightConvolutionComponent::UpdateNaturalGradient( const CuMatrixBase &in_value, const CuMatrixBase &out_deriv) { - CuVector bias_temp(bias_params_.Dim()); + CuVector bias_deriv(bias_params_.Dim()); - { // this block computes 'bias_temp', the derivative w.r.t. the bias. + { // this block computes 'bias_deriv', the derivative w.r.t. the bias. 
KALDI_ASSERT(out_deriv.Stride() == out_deriv.NumCols() && out_deriv.NumCols() == model_.height_out * model_.num_filters_out); CuSubMatrix out_deriv_reshaped( out_deriv.Data(), out_deriv.NumRows() * model_.height_out, model_.num_filters_out, model_.num_filters_out); - bias_temp.AddRowSumMat(1.0, out_deriv_reshaped); + bias_deriv.AddRowSumMat(1.0, out_deriv_reshaped); } - CuMatrix params_temp(linear_params_.NumRows(), + CuMatrix params_deriv(linear_params_.NumRows(), linear_params_.NumCols() + 1); - params_temp.CopyColFromVec(bias_temp, linear_params_.NumCols()); + params_deriv.CopyColFromVec(bias_deriv, linear_params_.NumCols()); - CuSubMatrix linear_params_temp( - params_temp, 0, linear_params_.NumRows(), + CuSubMatrix linear_params_deriv( + params_deriv, 0, linear_params_.NumRows(), 0, linear_params_.NumCols()); ConvolveBackwardParams(indexes.computation, in_value, out_deriv, - 1.0, &linear_params_temp); + 1.0, &linear_params_deriv); // the precondition-directions code outputs a scalar that // must be multiplied by its output (this saves one @@ -393,22 +389,22 @@ void TimeHeightConvolutionComponent::UpdateNaturalGradient( // scalars are different across iterations, the scalars // will be pretty similar on different iterations BaseFloat scale1, scale2; - preconditioner_in_.PreconditionDirections(¶ms_temp, NULL, + preconditioner_in_.PreconditionDirections(¶ms_deriv, NULL, &scale1); - CuMatrix params_temp_transpose(params_temp, kTrans); - preconditioner_out_.PreconditionDirections(¶ms_temp_transpose, + CuMatrix params_deriv_transpose(params_deriv, kTrans); + preconditioner_out_.PreconditionDirections(¶ms_deriv_transpose, NULL, &scale2); linear_params_.AddMat( learning_rate_ * scale1 * scale2, - params_temp_transpose.RowRange(0, linear_params_.NumCols()), + params_deriv_transpose.RowRange(0, linear_params_.NumCols()), kTrans); bias_params_.AddVec(learning_rate_ * scale1 * scale2, - params_temp_transpose.Row(linear_params_.NumCols())); + 
params_deriv_transpose.Row(linear_params_.NumCols())); } diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc index 2c4da825013..37d44a89673 100644 --- a/src/nnet3/nnet-parse.cc +++ b/src/nnet3/nnet-parse.cc @@ -537,7 +537,10 @@ void PrintParameterStats(std::ostringstream &os, void PrintParameterStats(std::ostringstream &os, const std::string &name, const CuMatrix ¶ms, - bool include_mean) { + bool include_mean, + bool include_row_norms, + bool include_column_norms, + bool include_singular_values) { os << std::setprecision(4); os << ", " << name << '-'; int32 dim = params.NumRows() * params.NumCols(); @@ -551,8 +554,26 @@ void PrintParameterStats(std::ostringstream &os, os << "rms=" << rms; } os << std::setprecision(6); // restore the default precision. - if (GetVerboseLevel() >= 2) { - // At verbose level >= 2, print stats of the singular values of the matrix. + + if (include_row_norms) { + CuVector row_norms(params.NumRows()); + row_norms.AddDiagMat2(1.0, params, kNoTrans, 0.0); + row_norms.ApplyPow(0.5); + Vector row_norms_cpu; + row_norms.Swap(&row_norms_cpu); + os << ", " << name << "-row-norms=" + << SummarizeVector(row_norms_cpu); + } + if (include_column_norms) { + CuVector col_norms(params.NumCols()); + col_norms.AddDiagMat2(1.0, params, kTrans, 0.0); + col_norms.ApplyPow(0.5); + Vector col_norms_cpu; + col_norms.Swap(&col_norms_cpu); + os << ", " << name << "-col-norms=" + << SummarizeVector(col_norms_cpu); + } + if (include_singular_values) { Matrix params_cpu(params); Vector s(std::min(params.NumRows(), params.NumCols())); params_cpu.Svd(&s); diff --git a/src/nnet3/nnet-parse.h b/src/nnet3/nnet-parse.h index fef21301ff6..83e36d37c0b 100644 --- a/src/nnet3/nnet-parse.h +++ b/src/nnet3/nnet-parse.h @@ -189,8 +189,11 @@ std::string ErrorContext(std::istream &is); std::string ErrorContext(const std::string &str); -// Returns a string that summarizes a vector fairly succintly, for -// printing stats in info lines. 
+/** Returns a string that summarizes a vector fairly succinctly, for
   printing stats in info lines.  For example:
   "[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.001,0.003,0.003,0.004 \
    0.005,0.01,0.07,0.11,0.14 0.18,0.24,0.29,0.39), mean=0.0745, stddev=0.0611]"
*/ std::string SummarizeVector(const Vector &vec); /** Print to 'os' some information about the mean and standard deviation of @@ -213,13 +216,25 @@ void PrintParameterStats(std::ostringstream &os, PrintParameterStats(os, "linear-params", linear_params_; would print to 'os' something like the string ", linear-params-rms=0.239". - If you set include_mean to true, it will print something like + If you set 'include_mean' to true, it will print something like ", linear-params-{mean-stddev}=0.103,0.183". + If you set 'include_row_norms' to true, it will print something + like + ", linear-params-row-norms=[percentiles(0,1........, stddev=0.0508]" + If you set 'include_column_norms' to true, it will print something + like + ", linear-params-col-norms=[percentiles(0,1........, stddev=0.0508]" + If you set 'include_singular_values' to true, it will print something + like + ", linear-params-singular-values=[percentiles(0,1........, stddev=0.0508]" */ void PrintParameterStats(std::ostringstream &os, const std::string &name, const CuMatrix &params, - bool include_mean = false); + bool include_mean = false, + bool include_row_norms = false, + bool include_column_norms = false, + bool include_singular_values = false); } // namespace nnet3 diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index d6c4e2163bf..471e7d943d4 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -1229,7 +1229,11 @@ void AffineComponent::PerturbParams(BaseFloat stddev) { std::string AffineComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info(); - PrintParameterStats(stream, "linear-params", linear_params_); + 
PrintParameterStats(stream, "linear-params", linear_params_, + false, // include_mean + true, // include_row_norms + true, // include_column_norms + GetVerboseLevel() >= 2); // include_singular_values PrintParameterStats(stream, "bias", bias_params_, true); return stream.str(); } @@ -2100,12 +2104,6 @@ void PerElementScaleComponent::Backprop( PerElementScaleComponent *to_update = dynamic_cast(to_update_in); - if (in_deriv) { - // Propagate the derivative back to the input. - in_deriv->CopyFromMat(out_deriv); - in_deriv->MulColsVec(scales_); - } - if (to_update != NULL) { // Next update the model (must do this 2nd so the derivatives we propagate // are accurate, in case this == to_update_in.) @@ -2114,6 +2112,13 @@ void PerElementScaleComponent::Backprop( else // the call below is to a virtual function that may be re-implemented to_update->Update(debug_info, in_value, out_deriv); // by child classes. } + + if (in_deriv) { + // Propagate the derivative back to the input. + if (in_deriv->Data() != out_deriv.Data()) + in_deriv->CopyFromMat(out_deriv); + in_deriv->MulColsVec(scales_); + } } void PerElementScaleComponent::Read(std::istream &is, bool binary) { @@ -2968,9 +2973,7 @@ void NaturalGradientAffineComponent::Write(std::ostream &os, std::string NaturalGradientAffineComponent::Info() const { std::ostringstream stream; - stream << UpdatableComponent::Info(); - PrintParameterStats(stream, "linear-params", linear_params_); - PrintParameterStats(stream, "bias", bias_params_, true); + stream << AffineComponent::Info(); stream << ", rank-in=" << rank_in_ << ", rank-out=" << rank_out_ << ", num-samples-history=" << num_samples_history_ @@ -3072,6 +3075,12 @@ void LinearComponent::Read(std::istream &is, bool binary) { KALDI_ASSERT(token == ""); ExpectToken(is, binary, ""); params_.Read(is, binary); + if (PeekToken(is, binary) == 'O') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &orthonormal_constraint_); + } else { + orthonormal_constraint_ = 0.0; + } 
ExpectToken(is, binary, ""); ReadBasicType(is, binary, &use_natural_gradient_); @@ -3149,6 +3158,10 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { preconditioner_in_.SetUpdatePeriod(update_period); preconditioner_out_.SetUpdatePeriod(update_period); + + orthonormal_constraint_ = 0.0; + cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); + if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); @@ -3160,6 +3173,10 @@ void LinearComponent::Write(std::ostream &os, WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate WriteToken(os, binary, ""); params_.Write(os, binary); + if (orthonormal_constraint_ != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, orthonormal_constraint_); + } WriteToken(os, binary, ""); WriteBasicType(os, binary, use_natural_gradient_); @@ -3183,7 +3200,13 @@ void LinearComponent::Write(std::ostream &os, std::string LinearComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info(); - PrintParameterStats(stream, "params", params_); + PrintParameterStats(stream, "params", params_, + false, // include_mean + true, // include_row_norms + true, // include_column_norms + GetVerboseLevel() >= 2); // include_singular_values + if (orthonormal_constraint_ != 0.0) + stream << ", orthonormal-constraint=" << orthonormal_constraint_; stream << ", use-natural-gradient=" << (use_natural_gradient_ ? 
"true" : "false") << ", rank-in=" << preconditioner_in_.GetRank() @@ -3249,12 +3272,14 @@ LinearComponent::LinearComponent( const LinearComponent &other): UpdatableComponent(other), params_(other.params_), + orthonormal_constraint_(other.orthonormal_constraint_), use_natural_gradient_(other.use_natural_gradient_), preconditioner_in_(other.preconditioner_in_), preconditioner_out_(other.preconditioner_out_) { } LinearComponent::LinearComponent(const CuMatrix ¶ms): params_(params), + orthonormal_constraint_(0.0), use_natural_gradient_(true) { // Set defaults for natural gradient. preconditioner_in_.SetRank(40); diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index d7cece06284..099d0c8fa2a 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -919,6 +919,16 @@ class NaturalGradientAffineComponent: public AffineComponent { bias-stddev, bias-mean) to initialize the parameters. Dimension is output-dim by (input-dim + 1), last column is interpreted as the bias. + orthonormal-constraint=0.0 If you set this to 1.0, then + this matrix will be (approximately) constrained during + training to have orthonormal rows (or columns, whichever + is fewer). You can choose a positive nonzero value different + than 1.0 to have a scaled orthonormal matrix, i.e. with singular + values at the selected value (e.g. 0.5, or 2.0). + This is not enforced inside the component + itself; you have to call ConstrainOrthonormal() + from the training code to do this. All this component + does is return the OrthonormalConstraint() value. 
Options to the natural gradient (you won't normally have to set these, the defaults are suitable): @@ -982,14 +992,19 @@ class LinearComponent: public UpdatableComponent { explicit LinearComponent(const LinearComponent &other); explicit LinearComponent(const CuMatrix ¶ms); + + BaseFloat OrthonormalConstraint() const { return orthonormal_constraint_; } + CuMatrixBase &Params() { return params_; } + const CuMatrixBase &Params() const { return params_; } private: // disallow assignment operator. LinearComponent &operator= ( const LinearComponent&); - CuMatrix params_; + + BaseFloat orthonormal_constraint_; // If true (and if no this->is_gradient_), use natural gradient updates. bool use_natural_gradient_; OnlineNaturalGradient preconditioner_in_; @@ -1460,8 +1475,12 @@ class PermuteComponent: public Component { -// PerElementScaleComponent scales each dimension of its input with a separate -// trainable scale; it's like a linear component with a diagonal matrix. +/** + PerElementScaleComponent scales each dimension of its input with a separate + trainable scale; it's like a linear component with a diagonal matrix. This + version (and its child class NaturalGradientPerElementScaleComponent) + requires the input for backprop. See also ScaleAndOffsetComponent. 
+*/ class PerElementScaleComponent: public UpdatableComponent { public: virtual int32 InputDim() const { return scales_.Dim(); } @@ -1474,7 +1493,7 @@ class PerElementScaleComponent: public UpdatableComponent { virtual std::string Type() const { return "PerElementScaleComponent"; } virtual int32 Properties() const { return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput| - kPropagateInPlace; + kPropagateInPlace|kBackpropInPlace; } virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, @@ -1686,8 +1705,7 @@ class ConstantFunctionComponent: public UpdatableComponent { // NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but -// it uses a natural gradient update for the per-element scales, and enforces a -// maximum amount of change per minibatch, for stability. +// it uses a natural gradient update for the per-element scales. class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent { public: diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 30cd47b3eb2..b73df647392 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -111,6 +111,10 @@ void NnetTrainer::TrainInternal(const NnetExample &eg, // happens when we use the model with batchnorm test-mode set). ScaleBatchnormStats(config_.batchnorm_stats_scale, nnet_); + // The following will only do something if we have a LinearComponent + // with is-constrained-orthonormal set to true. + ConstrainOrthonormal(nnet_); + // Scale deta_nnet if (success) ScaleNnet(config_.momentum, delta_nnet_); @@ -158,6 +162,21 @@ void NnetTrainer::TrainInternalBackstitch(const NnetExample &eg, max_change_scale, scale_adding, nnet_, &num_max_change_per_component_applied_, &num_max_change_global_applied_); + if (is_backstitch_step1) { + // The following will only do something if we have a LinearComponent + // with is-constrained-orthonormal set to true. We choose to do this + // only on the 1st backstitch step, for efficiency. 
+ ConstrainOrthonormal(nnet_); + } + + if (!is_backstitch_step1) { + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when we use the model with batchnorm test-mode set). Do this + // after backstitch step 2 so that the stats are scaled down before we start + // the next minibatch. + ScaleBatchnormStats(config_.batchnorm_stats_scale, nnet_); + } + ScaleNnet(0.0, delta_nnet_); } diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 64fc3003609..21b80c15732 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -859,6 +859,79 @@ class SvdApplier { std::string component_name_pattern_; }; +// Does an update that moves M closer to being a (matrix with +// orthonormal rows) times 'scale'. Note: this will diverge if +// we start off with singular values too far from 'scale'. +void ConstrainOrthonormalInternal(BaseFloat scale, CuMatrixBase *M) { + // Larger alpha will update faster but will be more prone to instability. I + // believe the scalar value below shouldn't be more than 0.25 or maybe 0.5 or + // it will always be unstable. It should be > 0.0. + // The factor of 1/scale^4 is, I *believe*, going to give us the right + // kind of invariance w.r.t. the scale. + BaseFloat alpha = 0.125 / (scale * scale * scale * scale); + + // We're enforcing the rows to be orthonormal. + // define P = M M^T. If P is unit then M has orthonormal rows. + // We actually want P to equal scale^2 * I, so that M's rows are + // orthogonal with 2-norms equal to 'scale'. + // We (notionally) add to the objective function, the value + // -alpha times the sum of squared elements of Q = (P- scale^2 * I). 
+ int32 rows = M->NumRows(), cols = M->NumCols(); + CuMatrix M_update(rows, cols); + CuMatrix P(rows, rows); + P.SymAddMat2(1.0, *M, kNoTrans, 0.0); + P.CopyLowerToUpper(); + P.AddToDiag(-1.0 * scale * scale); + + if (GetVerboseLevel() >= 1) { + BaseFloat error = P.FrobeniusNorm(); + KALDI_VLOG(1) << "Error in orthogonality is " << error; + } + + // At this point, the matrix P contains what, in the math, would be Q = + // P-scale^2*I. The derivative of the objective function w.r.t. an element q(i,j) + // of Q is now equal to -2*alpha*q(i,j), i.e. we could write q_deriv(i,j) + // = -2*alpha*q(i,j) This is also the derivative of the objective function + // w.r.t. p(i,j): i.e. p_deriv(i,j) = -2*alpha*q(i,j). + // Suppose we have define this matrix as 'P_deriv'. + // The derivative of the objective w.r.t M equals + // 2 * P_deriv * M, which equals -4*alpha*(P-scale^2*I)*M. + // (Currently the matrix P contains what, in the math, is P-scale^2*I). + M_update.AddMatMat(-4.0 * alpha, P, kNoTrans, *M, kNoTrans, 0.0); + M->AddMat(1.0, M_update); +} + +/** + This function, to be called after processing every minibatch, is responsible + for enforcing the orthogonality constraint for any components of type + LinearComponent that have the "orthonormal_constraint" value set. + */ +void ConstrainOrthonormal(Nnet *nnet) { + for (int32 c = 0; c < nnet->NumComponents(); c++) { + Component *component = nnet->GetComponent(c); + LinearComponent *lc = dynamic_cast(component); + if (lc == NULL || lc->OrthonormalConstraint() == 0.0) + continue; + if (RandInt(0, 3) != 0) + continue; // For efficiency, only do this every 4 minibatches-- it won't + // stray far. 
+ + + BaseFloat scale = lc->OrthonormalConstraint(); + KALDI_ASSERT(scale > 0.0); + + CuMatrixBase ¶ms = lc->Params(); + int32 rows = params.NumRows(), cols = params.NumCols(); + if (rows <= cols) { + ConstrainOrthonormalInternal(scale, ¶ms); + } else { + CuMatrix params_trans(params, kTrans); + ConstrainOrthonormalInternal(scale, ¶ms_trans); + params.CopyFromMat(params_trans, kTrans); + } + } +} + // This code has been broken out of ReadEditConfig as it's quite long. // It implements the internals of the edit directive 'reduce-rank'. diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index d961b7cb6a0..b3dace8686f 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -251,7 +251,6 @@ struct CollapseModelConfig { void CollapseModel(const CollapseModelConfig &config, Nnet *nnet); - /** ReadEditConfig() reads a file with a similar-looking format to the config file read by Nnet::ReadConfig(), but this consists of a sequence of operations to @@ -452,6 +451,17 @@ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale, Nnet *nnet); +/** + This function, to be called after processing every minibatch, is responsible + for enforcing the orthogonality constraint for any components of type + LinearComponent that have the "orthonormal-constraint" value set to nonzero. + + In order to make it efficient on GPU, it doesn't make it completely orthonormal, + it just makes it closer to being orthonormal (times the 'orthonormal_constraint' + value). Over multiple iterations this rapidly makes it almost exactly orthonormal. + */ +void ConstrainOrthonormal(Nnet *nnet); + /** This utility function can be used to obtain the number of distinct 'n' values in a training example. This is the number of examples (e.g. sequences) that have been combined into a single example. 
(Actually diff --git a/src/nnet3bin/nnet3-show-progress.cc b/src/nnet3bin/nnet3-show-progress.cc index 7e937f0c211..25a65dbed5c 100644 --- a/src/nnet3bin/nnet3-show-progress.cc +++ b/src/nnet3bin/nnet3-show-progress.cc @@ -132,6 +132,10 @@ int main(int argc, char *argv[]) { { // Get info about magnitude of parameter change. Nnet diff_nnet(nnet1); AddNnet(nnet2, -1.0, &diff_nnet); + if (GetVerboseLevel() >= 1) { + KALDI_VLOG(1) << "Printing info for the difference between the neural nets: " + << diff_nnet.Info(); + } int32 num_updatable = NumUpdatableComponents(diff_nnet); Vector dot_prod(num_updatable); ComponentDotProducts(diff_nnet, diff_nnet, &dot_prod); @@ -139,12 +143,15 @@ int main(int argc, char *argv[]) { KALDI_LOG << "Parameter differences per layer are " << PrintVectorPerUpdatableComponent(nnet1, dot_prod); - Vector baseline_prod(num_updatable); + Vector baseline_prod(num_updatable), + new_prod(num_updatable); ComponentDotProducts(nnet1, nnet1, &baseline_prod); + ComponentDotProducts(nnet2, nnet2, &new_prod); baseline_prod.ApplyPow(0.5); + new_prod.ApplyPow(0.5); - KALDI_LOG << "Norms of parameter matrices are " - << PrintVectorPerUpdatableComponent(nnet1, baseline_prod); + KALDI_LOG << "Norms of parameter matrices are " + << PrintVectorPerUpdatableComponent(nnet2, new_prod); dot_prod.DivElements(baseline_prod); KALDI_LOG << "Relative parameter differences per layer are " From f4866afebac0db087cc4d526547cda1bb3fa053e Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 15 Dec 2017 21:53:30 -0500 Subject: [PATCH 022/184] [scripts] Some changes to lstm.py which I don't want to keep all of, but backing up via git. 
--- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 112 ++++++++++++++++---- 1 file changed, 90 insertions(+), 22 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 96f63537a55..208239262b6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -609,12 +609,18 @@ def set_default_configs(self): 'clipping-threshold' : 30.0, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, + # recurrence-scale is a scale we put on the c_t when doing linear projections + # from it... making it larger than 1 (e.g. 4) helps equalize scales. + 'recurrence-scale': 1.0, 'delay' : -1, # if you want to set 'self-repair-scale' (c.f. the # self-repair-scale-nonlinearity config value in older LSTM layers), you can # add 'self-repair-scale=xxx' to # lstm-nonlinearity-options. 'lstm-nonlinearity-options' : ' max-change=0.75', + # if self-stabilize=true, the W_all will be a + # LinearComponent followed by a ScaleAndOffsetComponent. + 'self-stabilize': False, # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. 
'ng-affine-options' : ' max-change=1.5', @@ -712,9 +718,19 @@ def _generate_lstm_config(self): # providing output to gate i and operating on an appended vector [x,r] configs.append("### Begin LTSM layer '{0}'".format(name)) configs.append("# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks.") - configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim * 4, - affine_str, l2_regularize_option)) + + if not self.config['self-stabilize']: + configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim * 4, + affine_str, l2_regularize_option)) + else: + configs.append("component name={0}.W_all type=LinearComponent input-dim={1} " + "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim * 4, + affine_str, l2_regularize_option)) + configs.append("component name={0}.W_all_so type=ScaleAndOffsetComponent dim={1} " + "max-change=0.75".format(name, cell_dim * 4)) + + configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") @@ -729,10 +745,18 @@ def _generate_lstm_config(self): configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format(name, 2 * cell_dim, bptrunc_str)) configs.append("### Nodes for the components above.") - configs.append("component-node name={0}.four_parts component={0}.W_all input=Append({1}, " - "IfDefined(Offset({0}.c_trunc, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " + "IfDefined(Offset(Scale({2}, {0}.c_trunc), {3})))".format( + name, input_descriptor, self.config['recurrence-scale'], delay)) + if 
self.config['self-stabilize']: + configs.append("component-node name={0}.W_all_so component={0}.W_all_so input={0}.W_all".format(name)) + W_all_name = 'W_all_so' + else: + W_all_name = 'W_all' + configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) + "input=Append({0}.{1}, IfDefined(Offset({0}.c_trunc, {2})))".format( + name, W_all_name, delay)) # we can print .c later if needed, but it generates a warning since it's not used. could use c_trunc instead #configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) @@ -796,7 +820,11 @@ def set_default_configs(self): 'clipping-threshold' : 30.0, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, + 'recurrence-scale': 4.0, 'delay' : -1, + # if self-stabilize=true, the W_all_b will be a + # LinearComponent followed by a ScaleAndOffsetComponent. + 'self-stabilize': False, # if you want to set 'self-repair-scale' (c.f. the # self-repair-scale-nonlinearity config value in older LSTM layers), you can # add 'self-repair-scale=xxx' to @@ -900,8 +928,9 @@ def _generate_lstm_config(self): # This differs from that code by a factorization of the W_all matrix. 
configs.append("### Begin LTSM layer '{0}'".format(name)) configs.append("component name={0}.W_all_a type=LinearComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, bottleneck_dim, - affine_str, l2_regularize_option)) + "orthonormal-constraint=1.0 output-dim={2} {3} {4}".format( + name, input_dim + cell_dim, bottleneck_dim, + affine_str, l2_regularize_option)) normalize_type = self.config['normalize-type'] if normalize_type == 'batchnorm': configs.append("component name={0}.W_batchnorm type=BatchNormComponent dim={1} ".format( @@ -910,9 +939,17 @@ def _generate_lstm_config(self): configs.append("component name={0}.W_renorm type=NormalizeComponent dim={1} ".format( name, bottleneck_dim)) - configs.append("component name={0}.W_all_b type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, - affine_str, l2_regularize_option)) + if not self.config['self-stabilize']: + configs.append("component name={0}.W_all_b type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, + affine_str, l2_regularize_option)) + else: + configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " + "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, + affine_str, l2_regularize_option)) + configs.append("component name={0}.W_all_b_so type=ScaleAndOffsetComponent dim={1} " + "max-change=0.75".format(name, cell_dim * 4)) + configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") @@ -929,7 +966,9 @@ def _generate_lstm_config(self): configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " - "IfDefined(Offset({0}.c_trunc, {2})))".format(name, input_descriptor, delay)) +# "IfDefined(Offset({0}.c_trunc, 
{2})))".format(name, input_descriptor, delay)) + "IfDefined(Offset(Scale({2}, {0}.c_trunc), {3})))".format( + name, input_descriptor, self.config['recurrence-scale'], delay)) if normalize_type != 'none': configs.append("component-node name={0}.W_{1} component={0}.W_{1} " "input={0}.W_all_a".format(name, @@ -939,8 +978,16 @@ def _generate_lstm_config(self): else: configs.append("component-node name={0}.W_all_b component={0}.W_all_b " "input={0}.W_all_a".format(name)) + if self.config['self-stabilize']: + configs.append("component-node name={0}.W_all_b_so component={0}.W_all_b_so " + "input={0}.W_all_b".format(name)) + W_all_b_name = 'W_all_b_so' + else: + W_all_b_name = 'W_all_b' + configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.W_all_b, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) + "input=Append({0}.{1}, IfDefined(Offset({0}.c_trunc, {2})))".format( + name, W_all_b_name, delay)) # we can print .c later if needed, but it generates a warning since it's not used. could use c_trunc instead #configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) @@ -1016,6 +1063,11 @@ def set_default_configs(self): # add 'self-repair-scale=xxx' to # lstm-nonlinearity-options. 'lstm-nonlinearity-options' : ' max-change=0.75', + # If you set 'self-stabilize=true', for W_all_a, instead + # of a NaturalGradientAffineComponent, it has a LinearComponent followed + # by a ScaleAndOffsetComponent. This is similar to + # "SELF-STABILIZED DEEP NEURAL NETWORK" by Ghahremani and Droppo. + 'self-stabilize': False, # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. 'ng-affine-options' : ' max-change=1.5', @@ -1136,9 +1188,18 @@ def _generate_lstm_config(self): # .W_. e.g. 
Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] configs.append("## Begin LTSM layer '{0}'".format(name)) configs.append("# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks.") - configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim * 4, - affine_str, l2_regularize_option)) + if self.config['self-stabilize']: + # have LinearComponent followed by ScaleAndOffsetComponent. + configs.append("component name={0}.W_all type=LinearComponent input-dim={1} " + "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim * 4, + affine_str, l2_regularize_option)) + configs.append("component name={0}.W_all_so type=ScaleAndOffsetComponent dim={1} " + "max-change=0.75".format(name, cell_dim * 4)) + else: + # have NaturalGradientAffineComponent + configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim * 4, + affine_str, l2_regularize_option)) configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") @@ -1156,24 +1217,31 @@ def _generate_lstm_config(self): .format(name, dropout_proportion)) configs.append("# Component specific to 'projected' LSTM (LSTMP), contains both recurrent"); configs.append("# and non-recurrent projections") - configs.append("component name={0}.W_rp type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format( + configs.append("component name={0}.W_rp type=LinearComponent orthonormal-constraint=2.0 " + "input-dim={1} output-dim={2} {3} {4}".format( name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str, l2_regularize_option)) configs.append("### Nodes for 
the components above.") - configs.append("component-node name={0}.four_parts component={0}.W_all input=Append({1}, " + configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " "IfDefined(Offset({0}.r_trunc, {2})))".format(name, input_descriptor, delay)) + if self.config['self-stabilize']: + configs.append("component-node name={0}.W_all_so component={0}.W_all_so input={0}.W_all".format(name)) + W_all_name = 'W_all_so' + else: + W_all_name = 'W_all' + if dropout_proportion != -1.0: # note: the 'input' is a don't-care as the component never uses it; it's required # in component-node lines. configs.append("component-node name={0}.dropout_mask component={0}.dropout_mask " "input={0}.dropout_mask".format(name)) configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})), {0}.dropout_mask)" - .format(name, delay)) + "input=Append({0}.{1}, IfDefined(Offset({0}.c_trunc, {2})), {0}.dropout_mask)" + .format(name, W_all_name, delay)) else: configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) + "input=Append({0}.{1}, IfDefined(Offset({0}.c_trunc, {2})))".format( + name, W_all_name, delay)) configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin " "dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin " From 2fd01ee65c1299f711b7f17422d6169e5622bb34 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 16 Dec 2017 01:33:28 -0500 Subject: [PATCH 023/184] [src] Fix bug in LinearComponent whereby use-natural-gradient defaulted to false. 
--- src/nnet3/nnet-simple-component.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 608f3284885..d7957f24102 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -2969,12 +2969,14 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { int32 rank_in = 20, rank_out = 80, update_period = 4; BaseFloat alpha = 4.0, num_samples_history = 2000.0; + use_natural_gradient_ = true; cfl->GetValue("num-samples-history", &num_samples_history); cfl->GetValue("alpha", &alpha); cfl->GetValue("rank-in", &rank_in); cfl->GetValue("rank-out", &rank_out); cfl->GetValue("update-period", &update_period); + cfl->GetValue("use-natural-gradient", &use_natural_gradient_); preconditioner_in_.SetAlpha(alpha); preconditioner_out_.SetAlpha(alpha); From 407f239449c9152cfb81ff57012cee2cceb16a04 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 16 Dec 2017 01:33:42 -0500 Subject: [PATCH 024/184] [src] Some refactoring of lstmb layer: using memnorm. --- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 148 +++++++----------- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 3 +- 2 files changed, 54 insertions(+), 97 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 208239262b6..5827ea4d179 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -737,12 +737,13 @@ def _generate_lstm_config(self): configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent " "cell-dim={1} {2} {3}".format(name, cell_dim, lstm_str, l2_regularize_option)) - configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") # Note from Dan: I don't remember why we are applying the backprop # truncation on both c and m appended together, instead of just on c. 
# Possibly there was some memory or speed or WER reason for it which I # have forgotten about now. - configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format(name, 2 * cell_dim, bptrunc_str)) + configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") + configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} " + "{2}".format(name, 2 * cell_dim, bptrunc_str)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " @@ -775,12 +776,12 @@ def _generate_lstm_config(self): # This class is for lines like -# 'fast-lstmb-layer name=lstm1 input=[-1] delay=-3' -# (you can also call it 'fast-lstmb-batchnorm-layer' if you want it to end -# in a batchnorm component). +# 'lstmb-layer name=lstm1 input=[-1] delay=-3' +# +# TODO: more description # It's like fast-lstm-layer but with a bottleneck (like an SVD) in the main parameter matrix # of the LSTM (W_all, which combines all the full-rank projections of the LSTM): we divide -# it into two matrices, with batch-norm in between to stabilize the training. +# it into two matrices, with an orbatch-norm in between to stabilize the training. # # The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, # the dimension defaults to the same as the input. @@ -808,32 +809,23 @@ def _generate_lstm_config(self): # i.e. history since about t = t-20, can be # accumulated in c_t.] 
# l2-regularize=0.0 Constant controlling l2 regularization for this layer -class XconfigFastLstmbLayer(XconfigLayerBase): +class XconfigLstmbLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token in [ 'fast-lstmb-layer', 'fast-lstmb-batchnorm-layer' ] + assert first_token == 'lstmb-layer' XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input':'[-1]', + self.config = { 'input':'[-1]', 'cell-dim' : -1, # this is a required argument 'bottleneck-dim': -1, # this is a required argument 'clipping-threshold' : 30.0, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, - 'recurrence-scale': 4.0, 'delay' : -1, - # if self-stabilize=true, the W_all_b will be a - # LinearComponent followed by a ScaleAndOffsetComponent. - 'self-stabilize': False, - # if you want to set 'self-repair-scale' (c.f. the - # self-repair-scale-nonlinearity config value in older LSTM layers), you can - # add 'self-repair-scale=xxx' to - # lstm-nonlinearity-options. 'lstm-nonlinearity-options' : ' max-change=0.75', # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. 
'ng-affine-options' : ' max-change=1.5', - 'normalize-type': 'batchnorm', # can be 'batchnorm', 'renorm', or 'none' 'l2-regularize': 0.0, 'decay-time': -1.0 } @@ -852,30 +844,16 @@ def check_configs(self): self.config['bottleneck-dim'])) if self.config['delay'] == 0: raise RuntimeError("delay cannot be zero") - assert self.config['normalize-type'] in ['batchnorm', 'renorm', 'none'] def auxiliary_outputs(self): - return ['c'] + return [] def output_name(self, auxiliary_output = None): - node_name = ('m_batchnorm' if self.layer_type == 'fast-lstmb-batchnorm-layer' - else 'm') - if auxiliary_output is not None: - if auxiliary_output == 'c': - node_name = 'c' - self.c_needed = True - else: - raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output)) - return '{0}.{1}'.format(self.name, node_name) + assert auxiliary_output is None + return '{0}.m_batchnorm'.format(self.name) def output_dim(self, auxiliary_output = None): - if auxiliary_output is not None: - if auxiliary_output == 'c': - self.c_needed = True - return self.config['cell-dim'] - # add code for other auxiliary_outputs here when we decide to expose them - else: - raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output)) + assert auxiliary_output is None return self.config['cell-dim'] def get_full_config(self): @@ -924,31 +902,26 @@ def _generate_lstm_config(self): configs = [] - # See XconfigFastLstmLayer to understand what's going on here. - # This differs from that code by a factorization of the W_all matrix. + # See XconfigFastLstmLayer to understand what's going on here. This + # differs from that code by a factorization of the W_all matrix into two + # pieces with a smaller dimension in between (with the first of the two + # pieces constrained to have orthonormal rows). Note: we don't apply l2 + # regularization to this layer, since, with the orthonormality + # constraint, it's meaningless. 
configs.append("### Begin LTSM layer '{0}'".format(name)) configs.append("component name={0}.W_all_a type=LinearComponent input-dim={1} " - "orthonormal-constraint=1.0 output-dim={2} {3} {4}".format( + "orthonormal-constraint=1.0 output-dim={2} {3} ".format( name, input_dim + cell_dim, bottleneck_dim, - affine_str, l2_regularize_option)) - normalize_type = self.config['normalize-type'] - if normalize_type == 'batchnorm': - configs.append("component name={0}.W_batchnorm type=BatchNormComponent dim={1} ".format( - name, bottleneck_dim)) - elif normalize_type == 'renorm': - configs.append("component name={0}.W_renorm type=NormalizeComponent dim={1} ".format( - name, bottleneck_dim)) + affine_str)) - if not self.config['self-stabilize']: - configs.append("component name={0}.W_all_b type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, - affine_str, l2_regularize_option)) - else: - configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, - affine_str, l2_regularize_option)) - configs.append("component name={0}.W_all_b_so type=ScaleAndOffsetComponent dim={1} " - "max-change=0.75".format(name, cell_dim * 4)) + configs.append("component name={0}.c_trunc_memnorm type=MemoryNormComponent dim={1} ".format( + name, cell_dim)) + + configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " + "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, + affine_str, l2_regularize_option)) + configs.append("component name={0}.W_all_b_so type=ScaleAndOffsetComponent dim={1} " + "max-change=0.75".format(name, cell_dim * 4)) configs.append("# The core LSTM nonlinearity, implemented as a single component.") @@ -958,49 +931,34 @@ def _generate_lstm_config(self): "cell-dim={1} {2} {3}".format(name, cell_dim, lstm_str, l2_regularize_option)) configs.append("# Component for backprop truncation, to avoid 
gradient blowup in long training examples.") - # Note from Dan: I don't remember why we are applying the backprop - # truncation on both c and m appended together, instead of just on c. - # Possibly there was some memory or speed or WER reason for it which I - # have forgotten about now. - configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format(name, 2 * cell_dim, bptrunc_str)) + + configs.append("component name={0}.c_trunc type=BackpropTruncationComponent dim={1} {2}".format( + name, cell_dim, bptrunc_str)) + configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( + name, cell_dim)) + configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " -# "IfDefined(Offset({0}.c_trunc, {2})))".format(name, input_descriptor, delay)) - "IfDefined(Offset(Scale({2}, {0}.c_trunc), {3})))".format( - name, input_descriptor, self.config['recurrence-scale'], delay)) - if normalize_type != 'none': - configs.append("component-node name={0}.W_{1} component={0}.W_{1} " - "input={0}.W_all_a".format(name, - normalize_type)) - configs.append("component-node name={0}.W_all_b component={0}.W_all_b " - "input={0}.W_{1}".format(name, normalize_type)) - else: - configs.append("component-node name={0}.W_all_b component={0}.W_all_b " - "input={0}.W_all_a".format(name)) - if self.config['self-stabilize']: - configs.append("component-node name={0}.W_all_b_so component={0}.W_all_b_so " - "input={0}.W_all_b".format(name)) - W_all_b_name = 'W_all_b_so' - else: - W_all_b_name = 'W_all_b' + "IfDefined(Offset({0}.c_trunc_memnorm, {2})))".format( + name, input_descriptor, delay)) + configs.append("component-node name={0}.W_all_b component={0}.W_all_b " + "input={0}.W_all_a".format(name)) + configs.append("component-node name={0}.W_all_b_so component={0}.W_all_b_so " + "input={0}.W_all_b".format(name)) configs.append("component-node 
name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.{1}, IfDefined(Offset({0}.c_trunc, {2})))".format( - name, W_all_b_name, delay)) - # we can print .c later if needed, but it generates a warning since it's not used. could use c_trunc instead - #configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) - configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) - configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim)) - # configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) - - if self.layer_type == "fast-lstmb-batchnorm-layer": - # Add the batchnorm component, if requested to include batchnorm. - configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( - name, cell_dim)) - configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " - "input={0}.m".format(name)) + "input=Append({0}.W_all_b_so, IfDefined(Offset({0}.c_trunc, {1})))".format( + name, delay)) + configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 " + "dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} " + "dim={1}".format(name, cell_dim)) + configs.append("component-node name={0}.c_trunc component={0}.c_trunc input={0}.c".format(name)) + configs.append("component-node name={0}.c_trunc_memnorm component={0}.c_trunc_memnorm " + "input={0}.c_trunc".format(name)) + configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " + "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) return configs diff --git 
a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 0fff379cf31..db9550818cd 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -40,8 +40,7 @@ 'fast-lstm-batchnorm-layer' : xlayers.XconfigFastLstmLayer, 'fast-lstmp-layer' : xlayers.XconfigFastLstmpLayer, 'fast-lstmp-batchnorm-layer' : xlayers.XconfigFastLstmpLayer, - 'fast-lstmb-layer' : xlayers.XconfigFastLstmbLayer, - 'fast-lstmb-batchnorm-layer' : xlayers.XconfigFastLstmbLayer, + 'lstmb-layer' : xlayers.XconfigLstmbLayer, 'stats-layer': xlayers.XconfigStatsLayer, 'relu-conv-layer': xlayers.XconfigConvLayer, 'conv-layer': xlayers.XconfigConvLayer, From 76250980ec873327c34e93b146c4f2f1ab3336ba Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 18 Dec 2017 17:44:17 -0500 Subject: [PATCH 025/184] [scripts] Cosmetic change: add message --- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 2 ++ egs/wsj/s5/steps/nnet3/get_egs.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index cec6f8e166f..0294df0d84a 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -150,6 +150,8 @@ if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. rm $dir/uniq2utt $dir/valid_uttlist.tmp fi +echo "$0: creating egs. 
To ensure they are not deleted later you can do: touch $dir/.nodelete" + cat $data/utt2dur | \ awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ utils/filter_scp.pl --exclude $dir/valid_uttlist | \ diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index a6dd9682616..c8cbf67c8b8 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -138,6 +138,8 @@ awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlis [ -z "$transform_dir" ] && transform_dir=$alidir +echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete" + # because we'll need the features with a different number of jobs than $alidir, # copy to ark,scp. if [ -f $transform_dir/raw_trans.1 ]; then From e652f0eb7891e77eea13803118467a887d07761e Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 18 Dec 2017 18:08:15 -0500 Subject: [PATCH 026/184] [src] Fix optimization bug in nnet3, regarding Scale() expressions> --- src/nnet3/nnet-optimize-utils.cc | 5 ++--- src/nnet3/nnet-optimize.cc | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index e3c5edba565..d28626fee86 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -737,9 +737,7 @@ bool VariableMergingOptimizer::MergeVariables() { // potentially merge into a single variable. const NnetComputation::Command &c = computation_->commands[command_index]; int32 s1 = -1, s2 = -1; - // TODO: add kScale command and remove the check for 1.0 if (c.command_type == kMatrixCopy && - // c.alpha == 1.0 && config_.remove_assignments) { s2 = c.arg1; // s2 is the written-to matrix. s1 = c.arg2; @@ -997,7 +995,8 @@ std::pair VariableMergingOptimizer::MayBeMerged( if (!left && !right) // save some time. 
return std::pair(false,false); bool is_assignment = (computation_->commands[command_index].command_type == - kMatrixCopy); + kMatrixCopy && + computation_->commands[command_index].alpha == 1.0); ComputationAnalysis analysis(*computation_, analyzer_); if (is_assignment) { if (analysis.FirstNontrivialAccess(s2) == command_index && diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 3b2fe6b5b2f..212f707aefc 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -439,7 +439,7 @@ void ConvertAdditionToAssignment(const Nnet &nnet, case kMatrixAdd: c.command_type = kMatrixCopy; break; case kAddRows: c.command_type = kCopyRows; - break; + break; case kAddRowsMulti: c.command_type = kCopyRowsMulti; break; // note: kCopyToRowsMulti does not currently support alpha != 1.0. From b6e17dd9dfec9f9d80e6e1178fa7a91dc4d1043c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 21 Dec 2017 14:46:48 -0800 Subject: [PATCH 027/184] [src] Simplify/refactor natural gradient code in nnet3 --- src/nnet3/natural-gradient-online-test.cc | 5 +- src/nnet3/natural-gradient-online.cc | 59 +++++++---------------- src/nnet3/natural-gradient-online.h | 55 ++++++++++++++------- src/nnet3/nnet-convolutional-component.cc | 7 +-- src/nnet3/nnet-general-component.cc | 2 +- src/nnet3/nnet-optimize-test.cc | 2 +- src/nnet3/nnet-simple-component.cc | 22 ++++----- 7 files changed, 71 insertions(+), 81 deletions(-) diff --git a/src/nnet3/natural-gradient-online-test.cc b/src/nnet3/natural-gradient-online-test.cc index 7c46dfb3596..2829d4ebde7 100644 --- a/src/nnet3/natural-gradient-online-test.cc +++ b/src/nnet3/natural-gradient-online-test.cc @@ -270,7 +270,7 @@ void UnitTestPreconditionDirectionsOnline() { if (Rand() % 3 == 0) zero = true; //else if (Rand() % 2 == 0) one = true; - CuVector row_prod1(N), row_prod2(N); + CuVector row_prod1(N); BaseFloat gamma1, gamma2; BaseFloat big_eig_factor = RandInt(1, 20); big_eig_factor = big_eig_factor * big_eig_factor; @@ 
-300,14 +300,13 @@ void UnitTestPreconditionDirectionsOnline() { preconditioner1.PreconditionDirections(&Mcopy1, &row_prod1, &gamma1); - preconditioner2.PreconditionDirections(&Mcopy2, &row_prod2, &gamma2); + preconditioner2.PreconditionDirections(&Mcopy2, &gamma2); BaseFloat trace1 = TraceMatMat(M, M, kTrans), trace2 = TraceMatMat(Mcopy1, Mcopy1, kTrans); AssertEqual(trace1, trace2 * gamma2 * gamma2, 1.0e-02); AssertEqual(Mcopy1, Mcopy2); - AssertEqual(row_prod1, row_prod2, 1.0e-02); AssertEqual(gamma1, gamma2, 1.0e-02); // make sure positive definite diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc index cf0311449db..9c9652559de 100644 --- a/src/nnet3/natural-gradient-online.cc +++ b/src/nnet3/natural-gradient-online.cc @@ -146,7 +146,7 @@ void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { for (int32 i = 0; i < num_init_iters; i++) { BaseFloat scale; R0_copy.CopyFromMat(R0); - this_copy.PreconditionDirections(&R0_copy, NULL, &scale); + this_copy.PreconditionDirections(&R0_copy, &scale); } rank_ = this_copy.rank_; W_t_.Swap(&this_copy.W_t_); @@ -157,21 +157,13 @@ void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { void OnlineNaturalGradient::PreconditionDirections( CuMatrixBase *X_t, - CuVectorBase *row_prod, BaseFloat *scale) { if (X_t->NumCols() == 1) { // If the dimension of the space equals one then our natural gradient update // with rescaling becomes a no-op, but the code wouldn't naturally handle it // because rank would be zero. Support this as a special case. 
- if (row_prod) - row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0); - *scale = 1.0; - return; - } - - if (row_prod == NULL) { - CuVector row_prod_tmp(X_t->NumRows()); - PreconditionDirections(X_t, &row_prod_tmp, scale); + if (scale) + *scale = 1.0; return; } @@ -191,7 +183,17 @@ void OnlineNaturalGradient::PreconditionDirections( BaseFloat rho_t(rho_t_); Vector d_t(d_t_); read_write_mutex_.unlock(); - PreconditionDirectionsInternal(t, rho_t, d_t, &WJKL_t, X_t, row_prod, scale); + + BaseFloat initial_product = TraceMatMat(*X_t, *X_t, kTrans); + PreconditionDirectionsInternal(t, rho_t, initial_product, d_t, &WJKL_t, X_t); + if (scale) { + if (initial_product <= 0.0) { + *scale = 1.0; + } else { + BaseFloat final_product = TraceMatMat(*X_t, *X_t, kTrans); + *scale = sqrt(initial_product / final_product); + } + } } void OnlineNaturalGradient::ReorthogonalizeXt1( @@ -320,11 +322,10 @@ void OnlineNaturalGradient::SelfTest() const { void OnlineNaturalGradient::PreconditionDirectionsInternal( const int32 t, const BaseFloat rho_t, + const BaseFloat tr_X_Xt, const Vector &d_t, CuMatrixBase *WJKL_t, - CuMatrixBase *X_t, - CuVectorBase *row_prod, - BaseFloat *scale) { + CuMatrixBase *X_t) { int32 N = X_t->NumRows(), // Minibatch size. D = X_t->NumCols(), // Dimensions of vectors we're preconditioning R = rank_; // Rank of correction to unit matrix. @@ -383,17 +384,8 @@ void OnlineNaturalGradient::PreconditionDirectionsInternal( if (!frozen_) num_updates_skipped_++; - BaseFloat tr_Xt_XtT = TraceMatMat(*X_t, *X_t, kTrans); // X_hat_t = X_t - H_t W_t X_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0); - // each element i of row_prod will be inner product of row i of X_hat_t with - // itself. - row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0); - BaseFloat tr_Xhat_XhatT = row_prod->Sum(); - KALDI_ASSERT(tr_Xhat_XhatT == tr_Xhat_XhatT); // Check for NaN. - BaseFloat gamma_t = (tr_Xhat_XhatT == 0.0 ? 
1.0 : - sqrt(tr_Xt_XtT / tr_Xhat_XhatT)); - *scale = gamma_t; return; } J_t.AddMatMat(1.0, H_t, kTrans, *X_t, kNoTrans, 0.0); // J_t = H_t^T X_t @@ -456,31 +448,14 @@ void OnlineNaturalGradient::PreconditionDirectionsInternal( if (nf > 0 && self_debug_) { KALDI_WARN << "Floored " << nf << " elements of C_t."; } - BaseFloat tr_Xt_XtT_check; - if (self_debug_) - tr_Xt_XtT_check = TraceMatMat(*X_t, *X_t, kTrans); X_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0); // X_hat_t = X_t - H_t W_t - // set *row_prod to inner products of each row of X_hat_t with itself. - row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0); - - BaseFloat tr_Xhat_XhatT = row_prod->Sum(); - // tr(X_t X_t^T) = tr(X_hat_t X_hat_t^T) - tr(L_t E_t) + 2 tr(L_t) - double tr_Xt_XtT = tr_Xhat_XhatT; - for (int32 i = 0; i < R; i++) - tr_Xt_XtT += L_t_cpu(i, i) * (2.0 - e_t(i)); - if (self_debug_) { - KALDI_ASSERT(ApproxEqual(tr_Xt_XtT, tr_Xt_XtT_check)); - } - BaseFloat gamma_t = (tr_Xhat_XhatT == 0.0 ? 1.0 : - sqrt(tr_Xt_XtT / tr_Xhat_XhatT)); - *scale = gamma_t; Vector sqrt_c_t(c_t); sqrt_c_t.ApplyPow(0.5); // \rho_{t+1} = 1/(D - R) (\eta/N tr(X_t X_t^T) + (1-\eta)(D \rho_t + tr(D_t)) - tr(C_t^{0.5})). - BaseFloat rho_t1 = 1.0 / (D - R) * (eta / N * tr_Xt_XtT + BaseFloat rho_t1 = 1.0 / (D - R) * (eta / N * tr_X_Xt + (1-eta)*(D * rho_t + d_t.Sum()) - sqrt_c_t.Sum()); // D_{t+1} = C_t^{0.5} - \rho_{t+1} I diff --git a/src/nnet3/natural-gradient-online.h b/src/nnet3/natural-gradient-online.h index 67c25eb0dbc..0c43c7f5c46 100644 --- a/src/nnet3/natural-gradient-online.h +++ b/src/nnet3/natural-gradient-online.h @@ -437,33 +437,48 @@ class OnlineNaturalGradient { // see comment where 'frozen_' is declared. inline void Freeze(bool frozen) { frozen_ = frozen; } - // The "R" pointer is both the input (R in the comment) and the output (P in - // the comment; equal to the preconditioned directions before scaling by - // gamma). 
If the pointer "row_prod" is supplied, it's set to the inner product - // of each row of the preconditioned directions P, at output, with itself. - // You would need to apply "scale" to R and "scale * scale" to row_prod, to - // get the preconditioned directions; we don't do this ourselves, in order to - // save CUDA calls. + /** + This call implements the main functionality of this class. + + @param [in,out] R The "R" pointer is both the input (R in the + comment, X in the paper), and the output (P in the comment, + X with a hat on it in the paper). Each row of R is viewed + as a vector in some space, where we're estimating a smoothed + Fisher matrix and then multiplying by the inverse of that + smoothed Fisher matrix. + + @param [out] scale If non-NULL, a scaling factor is written to here, + and the output 'R' should be multiplied by this factor by + the user (we don't do it internally, to save an operation). + The factor is chosen so that the vector 2-norm of R is the + same after the natural gradient as it was before. (The pointer + being NULL or non-NULL doesn't affect the magnitude of R; + in any case the user will probably want to do this rescaling, + the question being whether they want to do so manually or + not. + + */ void PreconditionDirections(CuMatrixBase *R, - CuVectorBase *row_prod, BaseFloat *scale); + + // Copy constructor. explicit OnlineNaturalGradient(const OnlineNaturalGradient &other); // Assignent operator OnlineNaturalGradient &operator = (const OnlineNaturalGradient &other); private: - // This does the work of PreconditionDirections (the top-level - // function handles some multithreading issues and then calls this function). + + // This is an internal function called from PreconditionDirections(), + // which handles some multithreading issues and then calls this function. // Note: WJKL_t (dimension 2*R by D + R) is [ W_t L_t; J_t K_t ]. 
void PreconditionDirectionsInternal(const int32 t, const BaseFloat rho_t, + const BaseFloat tr_X_Xt, const Vector &d_t, CuMatrixBase *WJKL_t, - CuMatrixBase *X_t, - CuVectorBase *row_prod, - BaseFloat *scale); + CuMatrixBase *X_t); void ComputeEt(const VectorBase &d_t, BaseFloat beta_t, @@ -512,10 +527,14 @@ class OnlineNaturalGradient { // or columns. static void InitOrthonormalSpecial(CuMatrixBase *R); - // Returns the learning rate eta as the function of the number of samples - // (actually, N is the number of vectors we're preconditioning, which due to - // context is not always exactly the same as the number of samples). The - // value returned depends on num_samples_history_. + // Returns the value eta (with 0 < eta < 1) which reflects how fast we update + // the estimate of the Fisher matrix (larger == faster). This is a function + // rather than a constant because we set this indirectly, via + // num_samples_history_ or num_minibatches_history_. The argument N is the + // number of vectors we're preconditioning, which is the number of rows in the + // argument R to PreconditionDirections(); you can think of it as the number + // of vectors we're preconditioning (and in the common case it's some multiple + // of the minibatch size) BaseFloat Eta(int32 N) const; // called if self_debug_ = true, makes sure the members satisfy certain @@ -593,13 +612,13 @@ class OnlineNaturalGradient { BaseFloat rho_t_; Vector d_t_; - // Used to prevent parameters being read or written in an inconsistent state. std::mutex read_write_mutex_; // This mutex is used to control which thread gets to update the // parameters, in multi-threaded code. 
std::mutex update_mutex_; + }; } // namespace nnet3 diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc index 333d7a79cfa..bea3b9d31d5 100644 --- a/src/nnet3/nnet-convolutional-component.cc +++ b/src/nnet3/nnet-convolutional-component.cc @@ -389,14 +389,11 @@ void TimeHeightConvolutionComponent::UpdateNaturalGradient( // scalars are different across iterations, the scalars // will be pretty similar on different iterations BaseFloat scale1, scale2; - preconditioner_in_.PreconditionDirections(¶ms_deriv, NULL, - &scale1); + preconditioner_in_.PreconditionDirections(¶ms_deriv, &scale1); CuMatrix params_deriv_transpose(params_deriv, kTrans); - preconditioner_out_.PreconditionDirections(¶ms_deriv_transpose, - NULL, &scale2); - + preconditioner_out_.PreconditionDirections(¶ms_deriv_transpose, &scale2); linear_params_.AddMat( learning_rate_ * scale1 * scale2, diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index bfb972f8735..bc7405f2836 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1251,7 +1251,7 @@ void ConstantComponent::Backprop( CuMatrix out_deriv_copy(out_deriv); BaseFloat scale = 1.0; to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, - NULL, &scale); + &scale); to_update->output_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); } else { diff --git a/src/nnet3/nnet-optimize-test.cc b/src/nnet3/nnet-optimize-test.cc index bcb02184720..35614d62b34 100644 --- a/src/nnet3/nnet-optimize-test.cc +++ b/src/nnet3/nnet-optimize-test.cc @@ -143,7 +143,7 @@ static bool UnitTestNnetOptimizeWithOptions(int32 srand_seed, KALDI_LOG << "Output sum (optimized) is " << output_opt.Sum(); if (!ApproxEqual(output, output_opt)) { KALDI_WARN << "Non-optimized and optimized versions of the computation give " - << "different outputs."; + << "different outputs: " << output << " vs. 
" << output_opt; return false; } diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index d7957f24102..f2019849117 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -1516,7 +1516,7 @@ void NaturalGradientRepeatedAffineComponent::Update( try { // Only apply the preconditioning/natural-gradient if we're not computing // the exact gradient. - preconditioner_in_.PreconditionDirections(&deriv, NULL, &scale); + preconditioner_in_.PreconditionDirections(&deriv, &scale); } catch (...) { int32 num_bad_rows = 0; for (int32 i = 0; i < out_deriv.NumRows(); i++) { @@ -2132,7 +2132,7 @@ void PerElementOffsetComponent::Backprop( // this scenario) CuMatrix out_deriv_copy(out_deriv_reshaped); BaseFloat scale = 1.0; - to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, NULL, + to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, &scale); to_update->offsets_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); @@ -2417,7 +2417,7 @@ void ScaleAndOffsetComponent::BackpropInternal( BaseFloat scale = 1.0; CuMatrix out_deriv_copy(out_deriv); to_update->offset_preconditioner_.PreconditionDirections( - &out_deriv_copy, NULL, &scale); + &out_deriv_copy, &scale); to_update->offsets_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); } @@ -2440,7 +2440,7 @@ void ScaleAndOffsetComponent::BackpropInternal( BaseFloat scale = 1.0; if (to_update->use_natural_gradient_ && !to_update->is_gradient_) { to_update->scale_preconditioner_.PreconditionDirections( - &in_value_reconstructed, NULL, &scale); + &in_value_reconstructed, &scale); } to_update->scales_.AddRowSumMat(scale * to_update->learning_rate_, in_value_reconstructed); @@ -2506,7 +2506,7 @@ void ConstantFunctionComponent::Backprop( CuMatrix out_deriv_copy(out_deriv); BaseFloat scale = 1.0; to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, - NULL, &scale); + &scale); 
to_update->output_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); } else { @@ -2847,8 +2847,8 @@ void NaturalGradientAffineComponent::Update( // than having the matrices scaled inside the preconditioning code). BaseFloat in_scale, out_scale; - preconditioner_in_.PreconditionDirections(&in_value_temp, NULL, &in_scale); - preconditioner_out_.PreconditionDirections(&out_deriv_temp, NULL, &out_scale); + preconditioner_in_.PreconditionDirections(&in_value_temp, &in_scale); + preconditioner_out_.PreconditionDirections(&out_deriv_temp, &out_scale); // "scale" is a scaling factor coming from the PreconditionDirections calls // (it's faster to have them output a scaling factor than to have them scale @@ -3077,9 +3077,9 @@ void LinearComponent::Backprop(const std::string &debug_info, // than having the matrices scaled inside the preconditioning code). BaseFloat in_scale, out_scale; to_update->preconditioner_in_.PreconditionDirections(&in_value_temp, - NULL, &in_scale); + &in_scale); to_update->preconditioner_out_.PreconditionDirections(&out_deriv_temp, - NULL, &out_scale); + &out_scale); BaseFloat local_lrate = in_scale * out_scale * to_update->learning_rate_; to_update->params_.AddMatMat(local_lrate, out_deriv_temp, kTrans, @@ -3753,7 +3753,7 @@ void NaturalGradientPerElementScaleComponent::Update( // scales_.AddRowSumMat(learning_rate_, derivs_per_frame). 
BaseFloat scale; - preconditioner_.PreconditionDirections(&derivs_per_frame, NULL, &scale); + preconditioner_.PreconditionDirections(&derivs_per_frame, &scale); CuVector delta_scales(scales_.Dim()); delta_scales.AddRowSumMat(scale * learning_rate_, derivs_per_frame); @@ -5632,7 +5632,7 @@ void LstmNonlinearityComponent::Backprop( BaseFloat scale = 1.0; if (!to_update->is_gradient_) { to_update->preconditioner_.PreconditionDirections( - ¶ms_deriv, NULL, &scale); + ¶ms_deriv, &scale); } to_update->params_.AddMat(to_update->learning_rate_ * scale, params_deriv); From ea7efbade674d16b306c951ab047b8fc39c9697c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 21 Dec 2017 19:11:00 -0800 Subject: [PATCH 028/184] [src] Fix bug in compilation with Scale() expressions. --- src/nnet3/nnet-compile.cc | 10 +++++----- src/nnet3/nnet-descriptor.cc | 2 ++ src/nnet3/nnet-optimize-utils.cc | 1 + src/nnet3/nnet-optimize.cc | 4 ++++ 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/nnet3/nnet-compile.cc b/src/nnet3/nnet-compile.cc index bac182a5ac5..93f35dc8615 100644 --- a/src/nnet3/nnet-compile.cc +++ b/src/nnet3/nnet-compile.cc @@ -449,7 +449,6 @@ void Compiler::ComputeInputLocationsList( const std::vector &output_indexes = step_info.output_indexes; const NetworkNode &node = nnet_.GetNode(step_info.node_index); const SumDescriptor &descriptor = node.descriptor.Part(part_index); - int32 num_indexes = output_indexes.size(); submat_locations_list->clear(); submat_locations_list->resize(num_indexes); @@ -664,6 +663,7 @@ void Compiler::CompileForwardSumDescriptor( int32 value_submatrix_index = step_info.value_parts[part_index]; const SumDescriptor &descriptor = nnet_.GetNode(step_info.node_index).descriptor.Part(part_index); + BaseFloat offset_term = descriptor.GetScaleForNode(-1); if (offset_term != 0.0) { computation->commands.push_back( @@ -763,10 +763,10 @@ void Compiler::CompileForwardFromSubmatLocations( std::vector indexes; if 
(ConvertToIndexes(submat_locations, &input_submatrix_index, &indexes)) { CompileForwardFromIndexes(value_submatrix_index, - input_submatrix_index, - alpha, - indexes, - computation); + input_submatrix_index, + alpha, + indexes, + computation); return; } else { // There are multiple source matrices. diff --git a/src/nnet3/nnet-descriptor.cc b/src/nnet3/nnet-descriptor.cc index 55bbb258b52..fb3d152dc2e 100644 --- a/src/nnet3/nnet-descriptor.cc +++ b/src/nnet3/nnet-descriptor.cc @@ -841,6 +841,7 @@ bool GeneralDescriptor::Normalize(GeneralDescriptor *desc) { desc->descriptor_type_ = child->descriptor_type_; desc->value1_ = child->value1_; desc->value2_ = child->value2_; + desc->alpha_ = child->alpha_; child->descriptors_.clear(); // avoid delete in destructor. delete child; changed = true; @@ -897,6 +898,7 @@ bool GeneralDescriptor::Normalize(GeneralDescriptor *desc) { desc->descriptors_.swap(child->descriptors_); desc->value1_ = child->value1_; desc->value2_ = child->value2_; + desc->alpha_ = child->alpha_; child->descriptors_.clear(); // avoid delete in destructor. delete child; changed = true; diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index d28626fee86..ded700dbbd8 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -2729,6 +2729,7 @@ void ComputationExpander::ExpandRowsCommand( // in the vector are row-indexes into s2. 
int32 old_arg3 = c_out->arg3; c_out->arg3 = expanded_computation_->indexes.size(); + c_out->alpha = c_in.alpha; expanded_computation_->indexes.push_back(std::vector()); std::vector &new_indexes = expanded_computation_->indexes.back(); const std::vector &old_indexes = computation_.indexes[old_arg3]; diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 212f707aefc..e12cb7b1c42 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -852,6 +852,10 @@ const NnetComputation* CachingOptimizingCompiler::CompileViaShortcut( need_debug_info, num_n_values, ans); seconds_taken_expand_ += timer.Elapsed(); } + if (GetVerboseLevel() >= 3) { + CheckComputation(nnet_, *ans, false); + } + { Timer timer; ans->ComputeCudaIndexes(); From 69d0d380b5e14771f0284429d8626f02e6b3f6a1 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 22 Dec 2017 18:45:17 -0500 Subject: [PATCH 029/184] [scripts] Fixing bug in fast-lstm-layer and lstmb-layer whereby c was used instead of m for affine transform. --- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 27 ++++++++++----------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 5827ea4d179..3743413ab34 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -737,17 +737,13 @@ def _generate_lstm_config(self): configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent " "cell-dim={1} {2} {3}".format(name, cell_dim, lstm_str, l2_regularize_option)) - # Note from Dan: I don't remember why we are applying the backprop - # truncation on both c and m appended together, instead of just on c. - # Possibly there was some memory or speed or WER reason for it which I - # have forgotten about now. 
configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} " "{2}".format(name, 2 * cell_dim, bptrunc_str)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " - "IfDefined(Offset(Scale({2}, {0}.c_trunc), {3})))".format( + "IfDefined(Offset(Scale({2}, {0}.m_trunc), {3})))".format( name, input_descriptor, self.config['recurrence-scale'], delay)) if self.config['self-stabilize']: configs.append("component-node name={0}.W_all_so component={0}.W_all_so input={0}.W_all".format(name)) @@ -763,6 +759,7 @@ def _generate_lstm_config(self): configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) if self.layer_type == "fast-lstm-batchnorm-layer": # Add the batchnorm component, if requested to include batchnorm. 
@@ -914,7 +911,7 @@ def _generate_lstm_config(self): name, input_dim + cell_dim, bottleneck_dim, affine_str)) - configs.append("component name={0}.c_trunc_memnorm type=MemoryNormComponent dim={1} ".format( + configs.append("component name={0}.m_trunc_memnorm type=MemoryNormComponent dim={1} ".format( name, cell_dim)) configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " @@ -932,15 +929,15 @@ def _generate_lstm_config(self): l2_regularize_option)) configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") - configs.append("component name={0}.c_trunc type=BackpropTruncationComponent dim={1} {2}".format( - name, cell_dim, bptrunc_str)) + configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format( + name, 2 * cell_dim, bptrunc_str)) configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( name, cell_dim)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " - "IfDefined(Offset({0}.c_trunc_memnorm, {2})))".format( + "IfDefined(Offset(Scale(1.0, {0}.m_trunc_memnorm), {2})))".format( name, input_descriptor, delay)) configs.append("component-node name={0}.W_all_b component={0}.W_all_b " "input={0}.W_all_a".format(name)) @@ -950,13 +947,15 @@ def _generate_lstm_config(self): configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " "input=Append({0}.W_all_b_so, IfDefined(Offset({0}.c_trunc, {1})))".format( name, delay)) - configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 " - "dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} " "dim={1}".format(name, cell_dim)) - configs.append("component-node name={0}.c_trunc component={0}.c_trunc input={0}.c".format(name)) - configs.append("component-node name={0}.c_trunc_memnorm 
component={0}.c_trunc_memnorm " - "input={0}.c_trunc".format(name)) + configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) + configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 " + "dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} " + "dim={1}".format(name, cell_dim)) + configs.append("component-node name={0}.m_trunc_memnorm component={0}.m_trunc_memnorm " + "input={0}.m_trunc".format(name)) configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) From 823cfe738703eb938b39a9f73dbc3e7da13896a8 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 22 Dec 2017 19:05:00 -0500 Subject: [PATCH 030/184] [egs] Extend compare_wer_general.sh for tedlium to print num-params --- egs/tedlium/s5_r2/local/chain/compare_wer_general.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh index 00b2d29cc88..88dde1ff0e2 100755 --- a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh +++ b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh @@ -102,5 +102,10 @@ for x in $*; do prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done +echo +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done echo From 6f781145cdadeaf351f73fc51d0e93cd51df0f22 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 22 Dec 2017 19:05:22 -0500 Subject: [PATCH 031/184] [scripts] Cosmetic fix to chain training script --- egs/wsj/s5/steps/nnet3/chain/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py 
b/egs/wsj/s5/steps/nnet3/chain/train.py index b62f5510e3c..7f607abd8dc 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -588,7 +588,7 @@ def train(args, run_opts): with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: f.write(report) - common_lib.execute_command("steps/info/nnet3_dir_info.pl " + common_lib.execute_command("steps/info/chain_dir_info.pl " "{0}".format(args.dir)) From acda336781a2acdbbb5f49c32712335e374faede Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 22 Dec 2017 23:53:15 -0800 Subject: [PATCH 032/184] [src] Diagonal-natural-gradient --- src/nnet3/natural-gradient-online.cc | 216 ++++++++++++++++++--------- src/nnet3/natural-gradient-online.h | 135 +++++++++++++++-- src/nnet3/nnet-simple-component.cc | 169 ++++++++++++++------- src/nnet3/nnet-simple-component.h | 21 ++- 4 files changed, 399 insertions(+), 142 deletions(-) diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc index 9c9652559de..4c4d5a1b888 100644 --- a/src/nnet3/natural-gradient-online.cc +++ b/src/nnet3/natural-gradient-online.cc @@ -26,8 +26,9 @@ namespace nnet3 { OnlineNaturalGradient::OnlineNaturalGradient(): rank_(40), update_period_(1), num_samples_history_(2000.0), num_minibatches_history_(0.0), alpha_(4.0), - epsilon_(1.0e-10), delta_(5.0e-04), frozen_(false), t_(-1), - num_updates_skipped_(0), self_debug_(false) { } + epsilon_(1.0e-10), delta_(5.0e-04), frozen_(false), t_(0), + self_debug_(false), + diagonal_power_(0.0), diagonal_epsilon_(1.0e-03) { } /** @@ -123,6 +124,7 @@ void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { // for locking reasons it's better to use a different object. OnlineNaturalGradient this_copy(*this); this_copy.InitDefault(D); + this_copy.t_ = 1; // Prevent recursion to Init() again. 
CuMatrix R0_copy(R0.NumRows(), R0.NumCols(), kUndefined); // 'num_iters' is number of iterations with the same data from a pseudorandom @@ -152,7 +154,6 @@ void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { W_t_.Swap(&this_copy.W_t_); d_t_.Swap(&this_copy.d_t_); rho_t_ = this_copy.rho_t_; - t_ = 0; } void OnlineNaturalGradient::PreconditionDirections( @@ -167,25 +168,28 @@ void OnlineNaturalGradient::PreconditionDirections( return; } - read_write_mutex_.lock(); - if (t_ == -1) // not initialized + if (t_ == 0) // not initialized Init(*X_t); - // Now t_ >= 0. - // We create local copies of the class variables... this is intended for - // multi-threaded safety so we can't read them in an inconsistent state, - // but we don't really waste anything here (a copy of W_t is needed anyway, - // if we're to update it). - int32 t = t_, R = W_t_.NumRows(), D = W_t_.NumCols(); + int32 R = W_t_.NumRows(), D = W_t_.NumCols(); // space for W_t, J_t, K_t, L_t. CuMatrix WJKL_t(2 * R, D + R); WJKL_t.Range(0, R, 0, D).CopyFromMat(W_t_); BaseFloat rho_t(rho_t_); Vector d_t(d_t_); - read_write_mutex_.unlock(); - BaseFloat initial_product = TraceMatMat(*X_t, *X_t, kTrans); - PreconditionDirectionsInternal(t, rho_t, initial_product, d_t, &WJKL_t, X_t); + bool updating = Updating(); + + BaseFloat initial_product; + if (diagonal_power_ == 0.0 || scale != NULL) + initial_product = TraceMatMat(*X_t, *X_t, kTrans); + + if (diagonal_power_ == 0.0) + PreconditionDirectionsInternal(rho_t, initial_product, + updating, d_t, &WJKL_t, X_t); + else + PreconditionDirectionsDiagonal(rho_t, updating, d_t, &WJKL_t, X_t); + if (scale) { if (initial_product <= 0.0) { *scale = 1.0; @@ -194,6 +198,115 @@ void OnlineNaturalGradient::PreconditionDirections( *scale = sqrt(initial_product / final_product); } } + t_ += 1; +} + +void OnlineNaturalGradient::PreconditionDirectionsDiagonal( + const BaseFloat rho_t, + bool updating, + const Vector &d_t, + CuMatrixBase *WJKL_t, + CuMatrixBase *X_t) { + 
KALDI_ASSERT(diagonal_power_ > 0.0 && diagonal_power_ <= 1.0 && + (diagonal_mean_.Dim() != 0 || updating)); + + int32 dim = X_t->NumCols(); + + if (diagonal_mean_.Dim() == 0) { + InitDiagonalParams(*X_t); + updating = false; + } + + CuVector new_diagonal_mean, new_diagonal_uvar; + + if (updating) { + new_diagonal_mean.Resize(dim, kUndefined); + new_diagonal_uvar.Resize(dim, kUndefined); + UpdateDiagonalStats(*X_t, &new_diagonal_mean, &new_diagonal_uvar); + } + + X_t->MulColsVec(diagonal_scale_); + + PreconditionDirectionsInternal(rho_t, TraceMatMat(*X_t, *X_t, kTrans), false, + d_t, WJKL_t, X_t); + + // We apply the scale both before and after the identity-plus-low-rank matrix, + // so that the combined matrix is symmetric. + X_t->MulColsVec(diagonal_scale_); + + + // If we're updating the diagonal mean and variance we do so *after* + // preconditioning the data. This is out of a concern about the provability + // of convergence (making it independent of the current minibatch). Most + // likely, in practice it would work fine updating it before, it might even be + // a little bit more stable. Anyway, this is how we're doing it, and it's how + // we did it for the core part of the natural gradient. + if (updating) { + diagonal_mean_.Swap(&new_diagonal_mean); + diagonal_uvar_.Swap(&new_diagonal_uvar); + UpdateDiagonalScale(); + } +} + +void OnlineNaturalGradient::UpdateDiagonalStats( + const CuMatrixBase &X, + CuVectorBase *diagonal_mean_new, + CuVectorBase *diagonal_uvar_new){ + int32 dim = X.NumCols(), num_rows = X.NumRows(); + KALDI_ASSERT(diagonal_mean_new->Dim() == dim && diagonal_uvar_new->Dim() == dim && + diagonal_mean_.Dim() == dim); + BaseFloat eta = Eta(X.NumRows()); + // 'eta' is a value that reflects how fast we update these stats, which is + // smaller if we're updating them slower, but strictly less than 1. It's + // basically the scale on the new stats, with 1-eta being the scale on the old + // stats. 
+ KALDI_ASSERT(eta > 0 && eta < 1.0); + + diagonal_mean_new->CopyFromVec(diagonal_mean_); + diagonal_uvar_new->CopyFromVec(diagonal_uvar_); + + diagonal_mean_new->AddRowSumMat(eta / num_rows, X, 1.0 - eta); + diagonal_uvar_new->AddDiagMat2(eta / num_rows, X, kTrans, 1.0 - eta); +} + +void OnlineNaturalGradient::InitDiagonalParams( + const CuMatrixBase &X) { + int32 dim = X.NumCols(), num_rows = X.NumRows(); + diagonal_mean_.Resize(dim); + diagonal_uvar_.Resize(dim); + diagonal_mean_.AddRowSumMat(1.0 / num_rows, X, 0.0); + diagonal_uvar_.AddDiagMat2(1.0 / num_rows, X, kTrans, 0.0); + UpdateDiagonalScale(); +} + + +void OnlineNaturalGradient::UpdateDiagonalScale() { + KALDI_ASSERT(diagonal_mean_.Dim() != 0); + int32 dim = diagonal_mean_.Dim(); + if (diagonal_scale_.Dim() != dim) + diagonal_scale_.Resize(dim); + diagonal_scale_.CopyFromVec(diagonal_uvar_); + diagonal_scale_.AddVecVec(-1.0, diagonal_mean_, diagonal_mean_, 1.0); + // At this point, diagonal_scale_ is the diagonal of the (centered) variance + // estimated from the x and x2 statistics, prior to any flooring or + // scaling. + BaseFloat avg_variance = diagonal_scale_.Sum() / dim; + if (avg_variance <= 1.0e-20) { + // either the data is all zero or very tiny, or something went wrong. Just + // set diagonal_scale_ to a constant. + diagonal_scale_.Set(1.0); + } else { + BaseFloat floor = diagonal_epsilon_ * avg_variance; + diagonal_scale_.ApplyFloor(floor); + // The following statement scales diagonal_scale_ so its average is close to + // 1, which helps keep things in a reasonable numeric range. There is no + // reason why it has to be exactly one, and the whole thing is mathematically + // invariant to this scaling factor-- we output the scaling factor 'scale' + // from PreconditionDirections() so that the user can rescale so the vector + // 2-norm of the X_t matrix is the same as was before the natural gradient. 
+ diagonal_scale_.Scale(1.0 / avg_variance); + diagonal_scale_.ApplyPow(-0.5 * diagonal_power_); + } } void OnlineNaturalGradient::ReorthogonalizeXt1( @@ -320,9 +433,9 @@ void OnlineNaturalGradient::SelfTest() const { } void OnlineNaturalGradient::PreconditionDirectionsInternal( - const int32 t, const BaseFloat rho_t, const BaseFloat tr_X_Xt, + bool updating, const Vector &d_t, CuMatrixBase *WJKL_t, CuMatrixBase *X_t) { @@ -344,46 +457,9 @@ void OnlineNaturalGradient::PreconditionDirectionsInternal( H_t.AddMatMat(1.0, *X_t, kNoTrans, W_t, kTrans, 0.0); // H_t = X_t W_t^T - bool locked = update_mutex_.try_lock(); - if (locked) { - // We'll release the lock if we don't plan to update the parameters. - - // Explanation of the conditions below: - // if (frozen_) because we don't do the update is the user called Freeze(). - // I forget why the (t_ > t) is here; probably some race condition encountered - // a long time ago. Not important; nnet3 doesn't use multiple threads anyway. - // The condition: - // (num_updates_skipped_ < update_period_ - 1 && t_ >= num_initial_updates) - // means we can update if either we're in the first 10 updates (e.g. first - // 10 minibatches), or if we've skipped 'update_period_ - 1' batches of data - // without updating the parameters (this allows us to update only, say, - // every 4 times, for speed, after updating the first 10 times). - - // Just hard-code it here that we do 10 initial updates before skipping any. - const int num_initial_updates = 10; - if (frozen_ || t_ > t || (num_updates_skipped_ < update_period_ - 1 && - t_ >= num_initial_updates)) { - update_mutex_.unlock(); - // We got the lock but we were already beaten to it by another thread, or - // we don't want to update yet due to update_period_ > 1 (this saves - // compute), so release the lock. 
- locked = false; - } - } - - if (!locked) { - // We're not updating the parameters, either because another thread is - // working on updating them, or because another thread already did so from - // the same or later starting point (making our update stale), or because - // update_period_ > 1. We just apply the preconditioning and return. - - // note: we don't bother with any locks before checking frozen_ or incrementing - // num_updates_skipped_ below, because the worst that could happen is that, - // on very rare occasions, we could skip one or two more updates than we - // intended. - if (!frozen_) - num_updates_skipped_++; - + if (!updating) { + // We're not updating the estimate of the Fisher matrix; we just apply the + // preconditioning and return. // X_hat_t = X_t - H_t W_t X_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0); return; @@ -481,22 +557,25 @@ void OnlineNaturalGradient::PreconditionDirectionsInternal( &L_t); } - // Commit the new parameters. - read_write_mutex_.lock(); - KALDI_ASSERT(t_ == t); // we already ensured this. - t_ = t + 1; - num_updates_skipped_ = 0; W_t_.Swap(&W_t1); d_t_.CopyFromVec(d_t1); rho_t_ = rho_t1; if (self_debug_) SelfTest(); +} + +bool OnlineNaturalGradient::Updating() const { + // Just hard-code it here that we do 10 initial updates before skipping any. + // This must be > 'num_init_iters = 3' from Init(). 
+ const int num_initial_updates = 10; - read_write_mutex_.unlock(); - update_mutex_.unlock(); + return (!frozen_ && + (t_ <= num_initial_updates || + (t_ - num_initial_updates) % update_period_ == 0)); } + BaseFloat OnlineNaturalGradient::Eta(int32 N) const { if (num_minibatches_history_ > 0.0) { KALDI_ASSERT(num_minibatches_history_ > 1.0); @@ -610,12 +689,15 @@ OnlineNaturalGradient::OnlineNaturalGradient(const OnlineNaturalGradient &other) num_samples_history_(other.num_samples_history_), num_minibatches_history_(other.num_minibatches_history_), alpha_(other.alpha_), epsilon_(other.epsilon_), delta_(other.delta_), - frozen_(other.frozen_), - t_(other.t_), num_updates_skipped_(other.num_updates_skipped_), + frozen_(other.frozen_), t_(other.t_), self_debug_(other.self_debug_), W_t_(other.W_t_), - rho_t_(other.rho_t_), d_t_(other.d_t_) { - // use default constructor for the mutexes. -} + rho_t_(other.rho_t_), d_t_(other.d_t_), + diagonal_power_(other.diagonal_power_), + diagonal_epsilon_(other.diagonal_epsilon_), + diagonal_mean_(other.diagonal_mean_), + diagonal_uvar_(other.diagonal_uvar_), + diagonal_scale_(other.diagonal_scale_) { } + OnlineNaturalGradient& OnlineNaturalGradient::operator = ( const OnlineNaturalGradient &other) { diff --git a/src/nnet3/natural-gradient-online.h b/src/nnet3/natural-gradient-online.h index 0c43c7f5c46..f2713063492 100644 --- a/src/nnet3/natural-gradient-online.h +++ b/src/nnet3/natural-gradient-online.h @@ -411,6 +411,43 @@ namespace nnet3 { is that this isn't going to be a problem. */ +/** + DIAGONAL_EXTENSION + + This comment explains the diagonal extension to the natural gradient method (this + was not described in the original paper). + + Physically this diagonal scaling happens both before and after the main natural + gradient code. I.e. 
the main natural gradient code (which makes use + of a scaled-unit-plus-low-rank factorization), happens inside the + space where we've applied the diagonal component of the preconditioning, + so the overall natural-gradient matrix is of the form: + diag scaled-unit-plus-low-rank diag. + The way this is estimated only really makes sense if diagonal_power_ + is either zero or one, but I expect that for in-between values it will + work fine in practice. + + The way the diagonal scaling factor is estimated is that we accumulate mean + and variance stats for each dimension (decaying over time like the previous + natural gradient stats), and set the scaling factor to some power of the + variance estimated this way. The power of the variance used to get the + scaling factor is actually -0.5 times diagonal_power_, the factor of 0.5 + being required because the scaling is applied twice, both before and after + the scaled-unit-plus-low-rank inverse-Fisher matrix, to preserve symmetry. + + It may seem odd that we are taking into account the mean here, while + conceptually it's the uncentered covariance of the vectors that we're + modeling. The reason is that any offset in the vectors we're modeling + can be taken into account by one of the eigenvectors of the low-rank + matrix, so we anticipate that taking the mean out of consideration will + tend to give us a better factorization. This is all a little bit ad-hoc. + It would be cleaner to formulate this whole thing as learning a factored + representation of the inverse Fisher matrix, but that would become + very complicated, so we just estimate the diagonal in this rather ad-hoc + way and then do the low-rank factorization of the Fisher matrix after + the diagonal preconditioning. 
+ */ + class OnlineNaturalGradient { public: @@ -434,6 +471,11 @@ class OnlineNaturalGradient { int32 GetRank() const { return rank_; } int32 GetUpdatePeriod() const { return update_period_; } + // search above for DIAGONAL_EXTENSION for explanations. Value should + // be between 0 and 1. + void SetDiagonalPower(BaseFloat p) { diagonal_power_ = p; } + BaseFloat GetDiagonalPower() const { return diagonal_power_; } + // see comment where 'frozen_' is declared. inline void Freeze(bool frozen) { frozen_ = frozen; } @@ -470,16 +512,30 @@ class OnlineNaturalGradient { private: - // This is an internal function called from PreconditionDirections(), - // which handles some multithreading issues and then calls this function. + // This is an internal function called from PreconditionDirections(). // Note: WJKL_t (dimension 2*R by D + R) is [ W_t L_t; J_t K_t ]. - void PreconditionDirectionsInternal(const int32 t, - const BaseFloat rho_t, + void PreconditionDirectionsInternal(const BaseFloat rho_t, const BaseFloat tr_X_Xt, + bool updating, + const Vector &d_t, + CuMatrixBase *WJKL_t, + CuMatrixBase *X_t); + + // This function is called from PreconditionDirections(), only if + // diagonal_power_ != 0.0 (see comment starting DIAGONAL_EXTENSION above). + // It takes care of the diagonal factors in the Fisher-matrix estimate + // and recurses to PreconditionDirectionsInternal(). + void PreconditionDirectionsDiagonal(const BaseFloat rho_t, + bool updating, const Vector &d_t, CuMatrixBase *WJKL_t, CuMatrixBase *X_t); + + // Works out from t_ and various class variables whether we will update + // the parameters on this iteration (returns true if so). + bool Updating() const; + void ComputeEt(const VectorBase &d_t, BaseFloat beta_t, VectorBase *e_t, @@ -541,6 +597,29 @@ class OnlineNaturalGradient { // properties. 
void SelfTest() const; + + // This function, called only if diagonal_power_ != 0.0 (see + // DIAGONAL_EXTENSION comment), initializes diagonal_mean_, diagonal_uvar_ and + // diagonal_scale_, with stats from this minibatch (X is the vectors before + // preconditioning, one vector per row). + void InitDiagonalParams(const CuMatrixBase &X); + + // This function, called only if diagonal_power_ != 0.0 (see + // DIAGONAL_EXTENSION comment), sets diagonal_mean_new and diagonal_uvar_new to + // updated versions of the diagonal stats in diagonal_mean_ and diagonal_uvar_: + // changed by scaling down the old stats and then adding in stats from 'X'. + // 'X' is the vectors (one per row) that are doing to multiplied by our + // natural gradient matrix. The provided pointers will be pointers to + // temporaries that will later be copied to class members. + void UpdateDiagonalStats(const CuMatrixBase &X, + CuVectorBase *diagonal_mean_new, + CuVectorBase *diagonal_uvar_new); + + // This function updates diagonal_scale_ from the stats in + // diagonal_mean_ and diagonal_uvar_. + void UpdateDiagonalScale(); + + // Configuration values: // The rank of the correction to the unit matrix (e.g. 20). @@ -596,15 +675,10 @@ class OnlineNaturalGradient { // the *second* time we see the same data (to avoid biasing the update). bool frozen_; - // t is a counter that measures how many updates we've done. + // t is a counter that measures how many times the user has previously called + // PreconditionDirections(); it's 0 if that has never been called. int32 t_; - // This keeps track of how many minibatches we've skipped updating the parameters, - // since the most recent update; it's used in enforcing "update_period_", which - // is a mechanism to avoid spending too much time updating the subspace (which can - // be wasteful). - int32 num_updates_skipped_; - // If true, activates certain checks. 
bool self_debug_; @@ -612,12 +686,41 @@ class OnlineNaturalGradient { BaseFloat rho_t_; Vector d_t_; - // Used to prevent parameters being read or written in an inconsistent state. - std::mutex read_write_mutex_; + // Things below this point relate to 'diagonal' preconditioning. + // Search above for DIAGONAL_EXTENSION for an in-depth explanation. + + // The diagonal extension is turned off by default (diagonal_power_ == 0.0), + // but you can turn it on by setting diagonal_power_ (probably to some + // positive value not greater than 1, with 1 corresponding to natural + // gradient, and 0.5 corresponding to something more like Adagrad). + BaseFloat diagonal_power_; + + // diagonal_epsilon_ (e.g. 0.001) is a floor on the diagonal elements of the + // variances; this is expressed relative to the average un-floored variance + // over all dimensions (since dynamic ranges differ considerably). + BaseFloat diagonal_epsilon_; + + // dim_ is not a real variable but it is useful for explaining some things + // we're doing below. It's the dimension of the vectors we're preconditioning: + // D in the math and the paper. Is is the same as W_t_.NumCols(). + // int32 dim_; + + // diagonal_mean_, of dimension dim_ (or zero if diagonal_power_ == 0.0), is a + // moving-average mean of the vectors we're preconditioning. + CuVector diagonal_mean_; + + // diagonal_xuvar_, of dimension dim_ (or zero if diagonal_power_ == 0.0), is a + // decaying average over minibatches of the (diagonal) uncentered variance of + // the input vectors we're preconditioning. + CuVector diagonal_uvar_; + + // diagonal_scale_, of dimension dim_ (or zero if diagonal_power_ == 0.0), is a + // vector of scaling factors which is the diagonal part of the inverse-Fisher + // matrix, applied before and after the scaled-unit-plus-low-rank part. + // It is the (floored and rescaled) variance estimated from the stats in + // diagonal_mean_ and diagonal_uvar_, taken to the power -0.5 * diagonal_power_. 
+ CuVector diagonal_scale_; - // This mutex is used to control which thread gets to update the - // parameters, in multi-threaded code. - std::mutex update_mutex_; }; diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index f2019849117..da14c188244 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -2649,16 +2649,37 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { linear_params_.Read(is, binary); ExpectToken(is, binary, ""); bias_params_.Read(is, binary); + + BaseFloat num_samples_history, alpha; + int32 rank_in, rank_out, update_period; + ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &rank_in_); + ReadBasicType(is, binary, &rank_in); ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &rank_out_); + ReadBasicType(is, binary, &rank_out); + if (PeekToken(is, binary) == 'D') { + ExpectToken(is, binary, ""); + BaseFloat d_in, d_out; + ReadBasicType(is, binary, &d_in); + ReadBasicType(is, binary, &d_out); + preconditioner_in_.SetDiagonalPower(d_in); + preconditioner_out_.SetDiagonalPower(d_out); + } ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &update_period_); + ReadBasicType(is, binary, &update_period); ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &num_samples_history_); + ReadBasicType(is, binary, &num_samples_history); ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &alpha_); + ReadBasicType(is, binary, &alpha); + + preconditioner_in_.SetNumSamplesHistory(num_samples_history); + preconditioner_out_.SetNumSamplesHistory(num_samples_history); + preconditioner_in_.SetAlpha(alpha); + preconditioner_out_.SetAlpha(alpha); + preconditioner_in_.SetRank(rank_in); + preconditioner_out_.SetRank(rank_out); + preconditioner_out_.SetUpdatePeriod(update_period); + if (PeekToken(is, binary) == 'M') { // MaxChangePerSample, long ago removed; back compatibility. 
ExpectToken(is, binary, ""); @@ -2687,7 +2708,6 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { token != "") KALDI_ERR << "Expected or " << ", got " << token; - SetNaturalGradientConfigs(); } @@ -2697,30 +2717,21 @@ NaturalGradientAffineComponent::NaturalGradientAffineComponent( AffineComponent(linear_params, bias_params, 0.001) { KALDI_ASSERT(bias_params.Dim() == linear_params.NumRows() && bias_params.Dim() != 0); - num_samples_history_ = 2000.0; - alpha_ = 4.0; - rank_in_ = 20; - rank_out_ = 80; - update_period_ = 4; - SetNaturalGradientConfigs(); + + // set some default natural gradient configs. + preconditioner_in_.SetRank(20); + preconditioner_out_.SetRank(80); + preconditioner_in_.SetUpdatePeriod(4); + preconditioner_out_.SetUpdatePeriod(4); } void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { bool ok = true; std::string matrix_filename; - num_samples_history_ = 2000.0; - alpha_ = 4.0; - rank_in_ = 20; - rank_out_ = 80; - update_period_ = 4; + is_gradient_ = false; // not configurable; there's no reason you'd want this InitLearningRatesFromConfig(cfl); - cfl->GetValue("num-samples-history", &num_samples_history_); - cfl->GetValue("alpha", &alpha_); - cfl->GetValue("rank-in", &rank_in_); - cfl->GetValue("rank-out", &rank_out_); - cfl->GetValue("update-period", &update_period_); if (cfl->GetValue("matrix", &matrix_filename)) { CuMatrix mat; @@ -2759,23 +2770,37 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { bias_params_.Scale(bias_stddev); bias_params_.Add(bias_mean); } + + // Set natural-gradient configs. 
+ BaseFloat num_samples_history = 2000.0, + alpha = 4.0, + diagonal_power_in = 0.0, + diagonal_power_out = 0.0; + int32 rank_in = 20, rank_out = 80, + update_period = 4; + cfl->GetValue("num-samples-history", &num_samples_history); + cfl->GetValue("alpha", &alpha); + cfl->GetValue("rank-in", &rank_in); + cfl->GetValue("rank-out", &rank_out); + cfl->GetValue("update-period", &update_period); + cfl->GetValue("diagonal-power-in", &diagonal_power_in); + cfl->GetValue("diagonal-power-out", &diagonal_power_out); + + preconditioner_in_.SetNumSamplesHistory(num_samples_history); + preconditioner_out_.SetNumSamplesHistory(num_samples_history); + preconditioner_in_.SetAlpha(alpha); + preconditioner_out_.SetAlpha(alpha); + preconditioner_in_.SetRank(rank_in); + preconditioner_out_.SetRank(rank_out); + preconditioner_out_.SetUpdatePeriod(update_period); + preconditioner_in_.SetDiagonalPower(diagonal_power_in); + preconditioner_out_.SetDiagonalPower(diagonal_power_out); + if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); - SetNaturalGradientConfigs(); -} - -void NaturalGradientAffineComponent::SetNaturalGradientConfigs() { - preconditioner_in_.SetRank(rank_in_); - preconditioner_in_.SetNumSamplesHistory(num_samples_history_); - preconditioner_in_.SetAlpha(alpha_); - preconditioner_in_.SetUpdatePeriod(update_period_); - preconditioner_out_.SetRank(rank_out_); - preconditioner_out_.SetNumSamplesHistory(num_samples_history_); - preconditioner_out_.SetAlpha(alpha_); - preconditioner_out_.SetUpdatePeriod(update_period_); } void NaturalGradientAffineComponent::Write(std::ostream &os, @@ -2786,26 +2811,39 @@ void NaturalGradientAffineComponent::Write(std::ostream &os, WriteToken(os, binary, ""); bias_params_.Write(os, binary); WriteToken(os, binary, ""); - WriteBasicType(os, binary, rank_in_); + WriteBasicType(os, binary, preconditioner_in_.GetRank()); 
WriteToken(os, binary, ""); - WriteBasicType(os, binary, rank_out_); + WriteBasicType(os, binary, preconditioner_out_.GetRank()); + BaseFloat d_in = preconditioner_in_.GetDiagonalPower(), + d_out = preconditioner_out_.GetDiagonalPower(); + if (d_in != 0.0 || d_out != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, d_in); + WriteBasicType(os, binary, d_out); + } WriteToken(os, binary, ""); - WriteBasicType(os, binary, update_period_); + WriteBasicType(os, binary, preconditioner_in_.GetUpdatePeriod()); WriteToken(os, binary, ""); - WriteBasicType(os, binary, num_samples_history_); + WriteBasicType(os, binary, preconditioner_in_.GetNumSamplesHistory()); WriteToken(os, binary, ""); - WriteBasicType(os, binary, alpha_); + WriteBasicType(os, binary, preconditioner_in_.GetAlpha()); WriteToken(os, binary, ""); } std::string NaturalGradientAffineComponent::Info() const { std::ostringstream stream; stream << AffineComponent::Info(); - stream << ", rank-in=" << rank_in_ - << ", rank-out=" << rank_out_ - << ", num-samples-history=" << num_samples_history_ - << ", update-period=" << update_period_ - << ", alpha=" << alpha_; + stream << ", rank-in=" << preconditioner_in_.GetRank() + << ", rank-out=" << preconditioner_out_.GetRank() + << ", num-samples-history=" << preconditioner_in_.GetNumSamplesHistory() + << ", update-period=" << preconditioner_in_.GetUpdatePeriod() + << ", alpha=" << preconditioner_in_.GetAlpha(); + BaseFloat d_in = preconditioner_in_.GetDiagonalPower(), + d_out = preconditioner_out_.GetDiagonalPower(); + if (d_in != 0.0 || d_out != 0.0) { + stream << ", diagonal-power-in=" << d_in + << ", diagonal-power-out=" << d_out; + } return stream.str(); } @@ -2816,15 +2854,8 @@ Component* NaturalGradientAffineComponent::Copy() const { NaturalGradientAffineComponent::NaturalGradientAffineComponent( const NaturalGradientAffineComponent &other): AffineComponent(other), - rank_in_(other.rank_in_), - rank_out_(other.rank_out_), - 
update_period_(other.update_period_), - num_samples_history_(other.num_samples_history_), - alpha_(other.alpha_), preconditioner_in_(other.preconditioner_in_), - preconditioner_out_(other.preconditioner_out_) { - SetNaturalGradientConfigs(); -} + preconditioner_out_(other.preconditioner_out_) { } void NaturalGradientAffineComponent::Update( const std::string &debug_info, @@ -2917,6 +2948,14 @@ void LinearComponent::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); ReadBasicType(is, binary, &rank_in); ReadBasicType(is, binary, &rank_out); + if (PeekToken(is, binary) == 'D') { + ExpectToken(is, binary, ""); + BaseFloat d_in, d_out; + ReadBasicType(is, binary, &d_in); + ReadBasicType(is, binary, &d_out); + preconditioner_in_.SetDiagonalPower(d_in); + preconditioner_out_.SetDiagonalPower(d_out); + } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &alpha); ExpectToken(is, binary, ""); @@ -2968,7 +3007,10 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { // Read various natural-gradient-related configs. 
int32 rank_in = 20, rank_out = 80, update_period = 4; BaseFloat alpha = 4.0, - num_samples_history = 2000.0; + num_samples_history = 2000.0, + diagonal_power_in = 0.0, + diagonal_power_out = 0.0; + use_natural_gradient_ = true; cfl->GetValue("num-samples-history", &num_samples_history); @@ -2977,6 +3019,9 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { cfl->GetValue("rank-out", &rank_out); cfl->GetValue("update-period", &update_period); cfl->GetValue("use-natural-gradient", &use_natural_gradient_); + cfl->GetValue("diagonal-power-in", &diagonal_power_in); + cfl->GetValue("diagonal-power-out", &diagonal_power_out); + preconditioner_in_.SetAlpha(alpha); preconditioner_out_.SetAlpha(alpha); @@ -2986,7 +3031,8 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { preconditioner_out_.SetNumSamplesHistory(num_samples_history); preconditioner_in_.SetUpdatePeriod(update_period); preconditioner_out_.SetUpdatePeriod(update_period); - + preconditioner_in_.SetDiagonalPower(diagonal_power_in); + preconditioner_out_.SetDiagonalPower(diagonal_power_out); orthonormal_constraint_ = 0.0; cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); @@ -3013,10 +3059,17 @@ void LinearComponent::Write(std::ostream &os, rank_out = preconditioner_out_.GetRank(), update_period = preconditioner_in_.GetUpdatePeriod(); BaseFloat alpha = preconditioner_in_.GetAlpha(), - num_samples_history = preconditioner_in_.GetNumSamplesHistory(); + num_samples_history = preconditioner_in_.GetNumSamplesHistory(), + d_in = preconditioner_in_.GetDiagonalPower(), + d_out = preconditioner_out_.GetDiagonalPower(); WriteToken(os, binary, ""); WriteBasicType(os, binary, rank_in); WriteBasicType(os, binary, rank_out); + if (d_in != 0.0 || d_out != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, d_in); + WriteBasicType(os, binary, d_out); + } WriteToken(os, binary, ""); WriteBasicType(os, binary, alpha); WriteToken(os, binary, ""); @@ -3036,6 +3089,12 @@ std::string 
LinearComponent::Info() const { GetVerboseLevel() >= 2); // include_singular_values if (orthonormal_constraint_ != 0.0) stream << ", orthonormal-constraint=" << orthonormal_constraint_; + BaseFloat d_in = preconditioner_in_.GetDiagonalPower(), + d_out = preconditioner_out_.GetDiagonalPower(); + if (d_in != 0.0 || d_out != 0.0) { + stream << ", diagonal-power-in=" << d_in + << ", diagonal-power-out=" << d_out; + } stream << ", use-natural-gradient=" << (use_natural_gradient_ ? "true" : "false") << ", rank-in=" << preconditioner_in_.GetRank() diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index ef2fbd988a5..2432c912e75 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -773,6 +773,13 @@ class LogSoftmaxComponent: public NonlinearComponent { minibatches) we update the Fisher-matrix estimates; making this > 1 saves a little time in training. default=4. + diagonal-power-in, diagonal-power-out + Control a diagonal factor in the natural gradient + factorization, for the input and output spaces + respectively 0.0 = default (old-style natural + gradient), 1.0 = natural gradient with the diagonal + factors; 0.5 is more like a factorized type of + adagrad. */ class NaturalGradientAffineComponent: public AffineComponent { public: @@ -805,15 +812,14 @@ class NaturalGradientAffineComponent: public AffineComponent { int32 update_period_; BaseFloat num_samples_history_; BaseFloat alpha_; + // note: the config values diagonal-power-in and diagonal-power-out + // are stored in the objects preconditioner_in_ and preconditioner_out_ + // directly. OnlineNaturalGradient preconditioner_in_; OnlineNaturalGradient preconditioner_out_; - // Sets the configs rank, alpha and eta in the preconditioner objects, - // from the class variables. 
- void SetNaturalGradientConfigs(); - virtual void Update( const std::string &debug_info, const CuMatrixBase &in_value, @@ -877,6 +883,13 @@ class NaturalGradientAffineComponent: public AffineComponent { minibatches) we update the Fisher-matrix estimates; making this > 1 saves a little time in training. default=4. + diagonal-power-in, diagonal-power-out + Control a diagonal factor in the natural gradient + factorization, for the input and output spaces + respectively 0.0 = default (old-style natural + gradient), 1.0 = natural gradient with the diagonal + factors; 0.5 is more like a factorized type of + adagrad. */ class LinearComponent: public UpdatableComponent { public: From 6495571fef901a3013c625d2d8b6273fd31d195c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 23 Dec 2017 03:02:01 -0500 Subject: [PATCH 033/184] Revert "[scripts] Fixing bug in fast-lstm-layer and lstmb-layer whereby c was used instead of m for affine transform." This reverts commit 69d0d380b5e14771f0284429d8626f02e6b3f6a1. --- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 27 +++++++++++---------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 3743413ab34..5827ea4d179 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -737,13 +737,17 @@ def _generate_lstm_config(self): configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent " "cell-dim={1} {2} {3}".format(name, cell_dim, lstm_str, l2_regularize_option)) + # Note from Dan: I don't remember why we are applying the backprop + # truncation on both c and m appended together, instead of just on c. + # Possibly there was some memory or speed or WER reason for it which I + # have forgotten about now. 
configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} " "{2}".format(name, 2 * cell_dim, bptrunc_str)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " - "IfDefined(Offset(Scale({2}, {0}.m_trunc), {3})))".format( + "IfDefined(Offset(Scale({2}, {0}.c_trunc), {3})))".format( name, input_descriptor, self.config['recurrence-scale'], delay)) if self.config['self-stabilize']: configs.append("component-node name={0}.W_all_so component={0}.W_all_so input={0}.W_all".format(name)) @@ -759,7 +763,6 @@ def _generate_lstm_config(self): configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) if self.layer_type == "fast-lstm-batchnorm-layer": # Add the batchnorm component, if requested to include batchnorm. 
@@ -911,7 +914,7 @@ def _generate_lstm_config(self): name, input_dim + cell_dim, bottleneck_dim, affine_str)) - configs.append("component name={0}.m_trunc_memnorm type=MemoryNormComponent dim={1} ".format( + configs.append("component name={0}.c_trunc_memnorm type=MemoryNormComponent dim={1} ".format( name, cell_dim)) configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " @@ -929,15 +932,15 @@ def _generate_lstm_config(self): l2_regularize_option)) configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") - configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format( - name, 2 * cell_dim, bptrunc_str)) + configs.append("component name={0}.c_trunc type=BackpropTruncationComponent dim={1} {2}".format( + name, cell_dim, bptrunc_str)) configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( name, cell_dim)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " - "IfDefined(Offset(Scale(1.0, {0}.m_trunc_memnorm), {2})))".format( + "IfDefined(Offset({0}.c_trunc_memnorm, {2})))".format( name, input_descriptor, delay)) configs.append("component-node name={0}.W_all_b component={0}.W_all_b " "input={0}.W_all_a".format(name)) @@ -947,15 +950,13 @@ def _generate_lstm_config(self): configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " "input=Append({0}.W_all_b_so, IfDefined(Offset({0}.c_trunc, {1})))".format( name, delay)) - configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} " - "dim={1}".format(name, cell_dim)) - configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) - configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 " + configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 " 
"dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} " + configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} " "dim={1}".format(name, cell_dim)) - configs.append("component-node name={0}.m_trunc_memnorm component={0}.m_trunc_memnorm " - "input={0}.m_trunc".format(name)) + configs.append("component-node name={0}.c_trunc component={0}.c_trunc input={0}.c".format(name)) + configs.append("component-node name={0}.c_trunc_memnorm component={0}.c_trunc_memnorm " + "input={0}.c_trunc".format(name)) configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) From f4c5e3d16a0920ee4f9b03917f38b69b374273b7 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 23 Dec 2017 16:29:24 -0500 Subject: [PATCH 034/184] [scripts] Add bottleneck-dim to xconfig basic layers and output layers. --- .../steps/libs/nnet3/xconfig/basic_layers.py | 194 +++++++++++------- 1 file changed, 120 insertions(+), 74 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index c8a71e15672..e62a090c25e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -402,8 +402,7 @@ def get_full_config(self): # the input layers need to be printed in 'init.config' (which # initializes the neural network prior to the LDA), in 'ref.config', # which is a version of the config file used for getting left and right - # context (it doesn't read anything for the LDA-like transform and/or - # presoftmax-prior-scale components) + # context (it doesn't read anything for the LDA-like transform). # In 'full.config' we write everything, this is just for reference, # and also for cases where we don't use the LDA-like transform. 
ans = [] @@ -430,6 +429,9 @@ class XconfigOutputLayer(XconfigLayerBase): Parameters of the class, and their defaults: input='[-1]' : Descriptor giving the input of the layer. dim=None : Output dimension of layer, will normally equal the number of pdfs. + bottleneck-dim=None : Bottleneck dimension of layer: if supplied, instead of + an affine component we'll have a linear then affine, so a linear + bottleneck, with the linear part constrained to be orthonormal. include-log-softmax=true : setting it to false will omit the log-softmax component- useful for chain models. objective-type=linear : the only other choice currently is @@ -441,16 +443,6 @@ class XconfigOutputLayer(XconfigLayerBase): learning-rate-factor=(0.5/xent_regularize), normally learning-rate-factor=5.0 since xent_regularize is normally 0.1. - presoftmax-scale-file=None : If set, a filename for a vector that - will be used to scale the output of the affine component before the - log-softmax (if include-log-softmax=true), or before the output - (if not). This is helpful to avoid instability in training due to - some classes having much more data than others. The way we normally - create this vector is to take the priors of the classes to the - power -0.25 and rescale them so the average is 1.0. This factor - -0.25 is referred to as presoftmax_prior_scale_power in scripts. In - the scripts this would normally be set to - config_dir/presoftmax_prior_scale.vec max-change=1.5 : Can be used to change the max-change parameter in the affine component; this affects how much the matrix can change on each iteration. @@ -462,6 +454,9 @@ class XconfigOutputLayer(XconfigLayerBase): ng-affine-options='' : Can be used supply non-default options to the affine layer (intended for the natural gradient but can be an arbitrary string to be added to the config line. e.g. 'update-period=2'.). + ng-linear-options='' : Options, like ng-affine-options, that are passed to + the LinearComponent, only in bottleneck layers (i.e. 
if bottleneck-dim + is supplied). """ def __init__(self, first_token, key_to_value, prev_names=None): @@ -475,13 +470,13 @@ def set_default_configs(self): # the most recent layer. self.config = {'input': '[-1]', 'dim': -1, + 'bottleneck-dim': -1, 'include-log-softmax': True, # this would be false for chain models 'objective-type': 'linear', # see Nnet::ProcessOutputNodeConfigLine in # nnet-nnet.cc for other options 'learning-rate-factor': 1.0, - 'presoftmax-scale-file': '', # used in DNN (not RNN) training when using # frame-level objfns, 'max-change': 1.5, @@ -489,7 +484,8 @@ def set_default_configs(self): 'bias-stddev': 0.0, 'l2-regularize': 0.0, 'output-delay': 0, - 'ng-affine-options': '' + 'ng-affine-options': '', + 'ng-linear-options': '' # only affects bottleneck output layers. } def check_configs(self): @@ -533,8 +529,20 @@ def output_dim(self, auxiliary_output=None): " layers") def get_full_config(self): - ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in LSTM initialization + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + + def _generate_config(self): + + configs = [] # note: each value of self.descriptors is (descriptor, dim, # normalized-string, output-string). 
@@ -543,10 +551,10 @@ def get_full_config(self): descriptor_final_string = self.descriptors['input']['final-string'] input_dim = self.descriptors['input']['dim'] output_dim = self.config['dim'] + bottleneck_dim = self.config['bottleneck-dim'] objective_type = self.config['objective-type'] learning_rate_factor = self.config['learning-rate-factor'] include_log_softmax = self.config['include-log-softmax'] - presoftmax_scale_file = self.config['presoftmax-scale-file'] param_stddev = self.config['param-stddev'] bias_stddev = self.config['bias-stddev'] l2_regularize = self.config['l2-regularize'] @@ -558,64 +566,70 @@ def get_full_config(self): l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize) if l2_regularize != 0.0 else '') - # note: ref.config is used only for getting the left-context and - # right-context of the network; - # final.config is where we put the actual network definition. - for config_name in ['ref', 'final']: - # First the affine node. - line = ('component name={0}.affine' - ' type=NaturalGradientAffineComponent' - ' input-dim={1}' - ' output-dim={2}' - ' param-stddev={3}' - ' bias-stddev={4}' - ' max-change={5} {6} {7} {8}' - ''.format(self.name, input_dim, output_dim, - param_stddev, bias_stddev, max_change, ng_affine_options, - learning_rate_option, l2_regularize_option)) - ans.append((config_name, line)) - - line = ('component-node name={0}.affine' - ' component={0}.affine input={1}' - ''.format(self.name, descriptor_final_string)) - ans.append((config_name, line)) - cur_node = '{0}.affine'.format(self.name) - - if presoftmax_scale_file is not '' and config_name == 'final': - # don't use the presoftmax-scale in 'ref.config' since that - # file won't exist at the time we evaluate it. - # (ref.config is used to find the left/right context). 
- line = ('component name={0}.fixed-scale' - ' type=FixedScaleComponent scales={1}' - ''.format(self.name, presoftmax_scale_file)) - ans.append((config_name, line)) - - line = ('component-node name={0}.fixed-scale' - ' component={0}.fixed-scale input={1}' - ''.format(self.name, cur_node)) - ans.append((config_name, line)) - cur_node = '{0}.fixed-scale'.format(self.name) + cur_node = descriptor_final_string + cur_dim = input_dim + + if bottleneck_dim >= 0: + if bottleneck_dim == 0 or bottleneck_dim >= input_dim or bottleneck_dim >= output_dim: + raise RuntimeError("Bottleneck dim has value that does not make sense: {0}".format( + bottleneck_dim)) + # This is the bottleneck case (it doesn't necessarily imply we + # will be using the features from the bottleneck; it's just a factorization + # of the matrix into two pieces without a nonlinearity in between). + # We don't include the l2-regularize option because it's useless + # given the orthonormality constraint. + linear_options = self.config['ng-linear-options'] + + # note: by default the LinearComponent uses natural gradient. 
+ line = ('component name={0}.linear type=LinearComponent ' + 'orthonormal-constraint=1.0 input-dim={1} output-dim={2} ' + 'max-change=0.75 {3}' + ''.format(self.name, input_dim, bottleneck_dim, linear_options)) + configs.append(line) + line = ('component-node name={0}.linear component={0}.linear input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.linear'.format(self.name) + cur_dim = bottleneck_dim + + + line = ('component name={0}.affine' + ' type=NaturalGradientAffineComponent' + ' input-dim={1}' + ' output-dim={2}' + ' param-stddev={3}' + ' bias-stddev={4}' + ' max-change={5} {6} {7} {8}' + ''.format(self.name, cur_dim, output_dim, + param_stddev, bias_stddev, max_change, ng_affine_options, + learning_rate_option, l2_regularize_option)) + configs.append(line) + line = ('component-node name={0}.affine' + ' component={0}.affine input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.affine'.format(self.name) - if include_log_softmax: - line = ('component name={0}.log-softmax' - ' type=LogSoftmaxComponent dim={1}' - ''.format(self.name, output_dim)) - ans.append((config_name, line)) + if include_log_softmax: + line = ('component name={0}.log-softmax' + ' type=LogSoftmaxComponent dim={1}' + ''.format(self.name, output_dim)) + configs.append(line) - line = ('component-node name={0}.log-softmax' - ' component={0}.log-softmax input={1}' - ''.format(self.name, cur_node)) - ans.append((config_name, line)) - cur_node = '{0}.log-softmax'.format(self.name) + line = ('component-node name={0}.log-softmax' + ' component={0}.log-softmax input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.log-softmax'.format(self.name) - if output_delay != 0: - cur_node = 'Offset({0}, {1})'.format(cur_node, output_delay) + if output_delay != 0: + cur_node = 'Offset({0}, {1})'.format(cur_node, output_delay) - line = ('output-node name={0} input={1} ' - 'objective={2}'.format( - self.name, cur_node, 
objective_type)) - ans.append((config_name, line)) - return ans + line = ('output-node name={0} input={1} ' + 'objective={2}'.format( + self.name, cur_node, objective_type)) + configs.append(line) + return configs class XconfigBasicLayer(XconfigLayerBase): @@ -657,9 +671,11 @@ def set_default_configs(self): # the most recent layer. self.config = {'input': '[-1]', 'dim': -1, + 'bottleneck-dim': -1, 'self-repair-scale': 1.0e-05, 'target-rms': 1.0, 'ng-affine-options': '', + 'ng-linear-options': '', # only affects bottleneck layers. 'dropout-proportion': 0.5, # dropout-proportion only # affects layers with # 'dropout' in the name. @@ -674,6 +690,10 @@ def set_default_configs(self): def check_configs(self): if self.config['dim'] < 0: raise RuntimeError("dim has invalid value {0}".format(self.config['dim'])) + b = self.config['bottleneck-dim'] + if b >= 0 and (b >= self.config['dim'] or b == 0): + raise RuntimeError("bottleneck-dim has an invalid value {0}".format(b)) + if self.config['self-repair-scale'] < 0.0 or self.config['self-repair-scale'] > 1.0: raise RuntimeError("self-repair-scale has invalid value {0}" .format(self.config['self-repair-scale'])) @@ -751,14 +771,40 @@ def _add_components(self, input_desc, input_dim, nonlinearities): "there is a final 'renorm' component.") configs = [] - # First the affine node. + cur_dim = input_dim + cur_node = input_desc + + # First the affine node (or linear then affine, if bottleneck). + if self.config['bottleneck-dim'] > 0: + # This is the bottleneck case (it doesn't necessarily imply we + # will be using the features from the bottleneck; it's just a factorization + # of the matrix into two pieces without a nonlinearity in between). + # We don't include the l2-regularize option because it's useless + # given the orthonormality constraint. 
+ linear_options = self.config['ng-linear-options'] + for opt_name in [ 'max-change', 'learning-rate-factor' ]: + value = self.config[opt_name] + if value != '': + linear_options += ' {0}={1}'.format(opt_name, value) + bottleneck_dim = self.config['bottleneck-dim'] + # note: by default the LinearComponent uses natural gradient. + line = ('component name={0}.linear type=LinearComponent ' + 'orthonormal-constraint=1.0 input-dim={1} output-dim={2} {3}' + ''.format(self.name, input_dim, bottleneck_dim, linear_options)) + configs.append(line) + line = ('component-node name={0}.linear component={0}.linear input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.linear'.format(self.name) + cur_dim = bottleneck_dim + + line = ('component name={0}.affine type=NaturalGradientAffineComponent' ' input-dim={1} output-dim={2} {3}' - ''.format(self.name, input_dim, output_dim, affine_options)) + ''.format(self.name, cur_dim, output_dim, affine_options)) configs.append(line) - line = ('component-node name={0}.affine component={0}.affine input={1}' - ''.format(self.name, input_desc)) + ''.format(self.name, cur_node)) configs.append(line) cur_node = '{0}.affine'.format(self.name) From 421a062477d732fc02e2109b9d50857ae0f18661 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 27 Dec 2017 00:03:33 -0500 Subject: [PATCH 035/184] [src] Some bug fixes; change to natural-gradient-online RE last dim --- egs/mini_librispeech/s5/local/chain/compare_wer.sh | 6 ++++++ src/nnet3/natural-gradient-online.cc | 12 +++++++++++- src/rnnlm/rnnlm-embedding-training.cc | 10 ++++------ 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/egs/mini_librispeech/s5/local/chain/compare_wer.sh b/egs/mini_librispeech/s5/local/chain/compare_wer.sh index cd6be14ed88..8ee5db2326a 100755 --- a/egs/mini_librispeech/s5/local/chain/compare_wer.sh +++ b/egs/mini_librispeech/s5/local/chain/compare_wer.sh @@ -129,3 +129,9 @@ for x in $*; do printf "% 10s" $prob done echo + +echo 
-n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc index 4c4d5a1b888..83702626f5f 100644 --- a/src/nnet3/natural-gradient-online.cc +++ b/src/nnet3/natural-gradient-online.cc @@ -18,6 +18,7 @@ // limitations under the License. #include "nnet3/natural-gradient-online.h" +#include "nnet3/nnet-parse.h" namespace kaldi { namespace nnet3 { @@ -286,7 +287,16 @@ void OnlineNaturalGradient::UpdateDiagonalScale() { if (diagonal_scale_.Dim() != dim) diagonal_scale_.Resize(dim); diagonal_scale_.CopyFromVec(diagonal_uvar_); - diagonal_scale_.AddVecVec(-1.0, diagonal_mean_, diagonal_mean_, 1.0); + // Because the last element may be the offset, and it doesn't + // make sense to subtract its mean for this purpose, only do this for + // the all-but-last-elements. + if (dim > 1) { + CuSubVector diagonal_scale_part(diagonal_scale_, 0, dim - 1), + diagonal_mean_part(diagonal_mean_, 0, dim - 1); + diagonal_scale_part.AddVecVec(-1.0, diagonal_mean_part, + diagonal_mean_part, 1.0); + } + // At this point, diagonal_scale_ is the diagonal of the (centered) variance // estimated from the x and x2 statistics, prior to any flooring or // scaling. 
diff --git a/src/rnnlm/rnnlm-embedding-training.cc b/src/rnnlm/rnnlm-embedding-training.cc index 00d939da5be..47b347047fb 100644 --- a/src/rnnlm/rnnlm-embedding-training.cc +++ b/src/rnnlm/rnnlm-embedding-training.cc @@ -77,12 +77,11 @@ void RnnlmEmbeddingTrainer::Train( if (l2_term != 0.0) { embedding_deriv->AddMat(l2_term, *embedding_mat_); } - } + } BaseFloat scale = 1.0; if (config_.use_natural_gradient) { - preconditioner_.PreconditionDirections(embedding_deriv, NULL, - &scale); + preconditioner_.PreconditionDirections(embedding_deriv, &scale); } scale *= config_.learning_rate; num_minibatches_++; @@ -130,11 +129,10 @@ void RnnlmEmbeddingTrainer::Train( if (l2_term != 0.0) { embedding_deriv->AddMat(l2_term, *embedding_mat_); } - } + } BaseFloat scale = 1.0; if (config_.use_natural_gradient) { - preconditioner_.PreconditionDirections(embedding_deriv, NULL, - &scale); + preconditioner_.PreconditionDirections(embedding_deriv, &scale); } scale *= config_.learning_rate; num_minibatches_++; From e71ddae4fca84f8f10efb30e578d50030de5259c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 27 Dec 2017 19:24:36 -0800 Subject: [PATCH 036/184] [src] Reorganize batch-norm code and add power option. --- src/nnet3/nnet-normalize-component.cc | 233 ++++++++++++++------------ src/nnet3/nnet-normalize-component.h | 15 +- src/nnet3/nnet-test-utils.cc | 1 + 3 files changed, 135 insertions(+), 114 deletions(-) diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index 1e3314bf91f..3f105cd8e2b 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -234,8 +234,8 @@ void BatchNormComponent::ComputeDerived() { // of numerical roundoff. scale_.ApplyFloor(0.0); scale_.Add(epsilon_); - scale_.ApplyPow(-0.5); - // now scale_ = min(variance, epsilon)^{-0.5}. + scale_.ApplyPow(power_); + // now scale_ = min(variance, epsilon)^power_ // next, multiply by the target RMS (normally 1.0). 
scale_.Scale(target_rms_); offset_.MulElements(scale_); @@ -253,10 +253,10 @@ void BatchNormComponent::Check() const { } BatchNormComponent::BatchNormComponent(const BatchNormComponent &other): - dim_(other.dim_), block_dim_(other.block_dim_), epsilon_(other.epsilon_), - target_rms_(other.target_rms_), test_mode_(other.test_mode_), - count_(other.count_), stats_sum_(other.stats_sum_), - stats_sumsq_(other.stats_sumsq_) { + dim_(other.dim_), block_dim_(other.block_dim_), power_(other.power_), + epsilon_(other.epsilon_), target_rms_(other.target_rms_), + test_mode_(other.test_mode_), count_(other.count_), + stats_sum_(other.stats_sum_), stats_sumsq_(other.stats_sumsq_) { ComputeDerived(); Check(); } @@ -267,6 +267,7 @@ std::string BatchNormComponent::Info() const { stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ << ", count=" << count_ + << ", power=" << power_ << ", test-mode=" << (test_mode_ ? "true" : "false"); if (count_ > 0) { Vector mean(stats_sum_), var(stats_sumsq_); @@ -285,12 +286,14 @@ std::string BatchNormComponent::Info() const { void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { dim_ = -1; block_dim_ = -1; + power_ = -0.5; epsilon_ = 1.0e-03; target_rms_ = 1.0; test_mode_ = false; bool ok = cfl->GetValue("dim", &dim_); cfl->GetValue("block-dim", &block_dim_); cfl->GetValue("epsilon", &epsilon_); + cfl->GetValue("power", &power_); cfl->GetValue("target-rms", &target_rms_); cfl->GetValue("test-mode", &test_mode_); if (!ok || dim_ <= 0) { @@ -304,6 +307,8 @@ void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); + if (power_ >= 0 || power_ <= -1.0) + KALDI_ERR << "Power has invalid value " << power_; count_ = 0; stats_sum_.Resize(block_dim_); stats_sumsq_.Resize(block_dim_); @@ -325,95 +330,72 @@ void BatchNormComponent::InitFromConfig(ConfigLine 
*cfl) { FORWARD PASS: - Define xsum = sum_i x(i) - x2sum = sum_i x(i)^2 - mean = xsum / n - var = x2sum / n - (mean*mean) - scale = sqrt(var + epsilon)^{-0.5} - offset = -mean * scale + Let 'power' be a constant, equal to -0.5 for regular batch-norm. + + To simplify the math we (conceptually, not physically) do the normalization in + two stages: first mean, then variance, so we have x(i) -> y(i) -> z(i). + + The name 'rscale' means 'raw scale', meaning the scale before including + target-rms. Later we'll define 'scale = target-rms * rscale', to make some + of the actual computations slightly more efficient. + + Define: mean = 1/I * sum_i x(i) + y(i) = x(i) - mean - y(i) = scale * x(i) + offset + var = 1/I \sum_i y(i)^2 + rscale = sqrt(var + epsilon)^power <---- For regular batchnorm, power == -0.5. + z(i) = target-rms * rscale * y(i) - Most of the rest of this comment derives how to compute the derivatives. If - you just want the formulas, please skip to the string 'BACKWARD PASS' below. + + Most of the rest of this comment derives how to compute the derivatives. If + you just want the formulas, please skip to the string 'BACKWARD PASS' below. We'll use a notation where an apostrophe on something means (the derivative of the objective function w.r.t. that thing), so y'(i) is df/dy(i), and so on. We are given y'(i). Propagating the derivatives backward: - offset' = sum_i y'(i) - scale' = (sum_i y'(i) * x(i)) - offset' * mean - var' = scale' * -0.5 * sqrt(var + epsilon)^{-1.5} - = -0.5 * scale' * scale^3 - mean' = -offset' * scale - 2 * mean * var' - xsum' = mean' / n - x2sum' = var' / n - - So the derivatives propagated back to the original data are: - x'(i) = y'(i) * scale + xsum' + x(i) * x2sum' - - The above is quite complicated to compute, but we can use some invariances - to work out a simpler way to compute the derivatives. - - Firstly, note that x'(i) is of the form: - - x'(i) = y'(i) * scale + [affine function of x(i)]. - - [it's a 1-d affine function, i.e. 
offset and scale]. - This has the same functional form as: - - x'(i) = y'(i) * scale + [affine function of y(i)]. - - since y(i) is an affine function of x(i) with nonzero scale. - Because the output is invariant to shifts in the input, sum_i x'(i) - will be zero. This is sufficient to determine the bias - term in the affine function. [Note: the scale on y(i) doesn't - come into it because the y(i) sum to zero]. The offset - will just be (sum_i y'(i) * scale / n); this makes the sum of x'(i) zero. - So let's write it as - - x'(i) = (y'(i) - 1/n sum_i y'(i)) * scale + alpha y(i). - - and it will be convenient to define: - - x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale - - which is just y'(i) with mean subtraction, scaled according to - the scale used in the normalization. So write - - x'(i) = x_deriv_base(i) + alpha y(i). - - The question is, what is the scale alpha. We don't actually need to - do any differentiation to figure this out. First, assume there is - no "+ epsilon" in the variance; later we'll explain why this doesn't - matter. The key to working out alpha is that the output is invariant - to scaling of the input. Assume we scale around the input's mean, - since that makes the math simpler. We can express this by the - constraint that (\sum_i x'(i) * (x(i) - avg-x)) = 0. This is - equivalent to the constraint that (\sum_i x'(i) y (i)) = 0, since - y(i) is x(i) - avg-x times a nonzero scale. We'll use this contraint - to determine alpha, Using the above expressionfor x(i), we can write - this constraint as: - \sum_i ( y(i) x_deriv_base(i) + alpha y(i) y(i)) = 0. - Now, since we said we'd ignore the epsilon, the output has unit variance, - so we know that \sum_i y(i) y(i) = n. - So alpha = - \sum_i y(i) x_deriv_base(i) / n. We can actually re-imagine - the epsilon term (or variance-flooring) as having been implemented by - adding a couple extra rows to the matrix with suitable values, and zero - output-deriv for those rows. 
If you think about it carefully you'll see that - the formula above is valid even if there is an extra term - in the variance. Anyway the correctness of the derivative will get tested - throughly by the component unit-tests. - - So to recap, here is the backprop. - - BACKWARD PASS: - - We are given y'(i), scale, and y(i). - - We compute: - x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale - alpha = - \sum_i y(i) x_deriv_base(i) / n - x'(i) = x_deriv_base(i) + alpha y(i) + + rscale' = (sum_i y(i) z'(i)) * target-rms + = (sum_i z(i) z'(i)) / rscale + + [ note: d(rscale)/d(var) = power * (var + epsilon)^{power - 1} + = power * rscale^{(power-1)/power} ] + + var' = rscale' * power * rscale^{(power-1)/power} + = power * (\sum_i z'(i) z(i)) * rscale^{(power-1)/power - 1} + = power * (\sum_i z'(i) z(i)) * rscale^{-1/power} + + [note: the following formula is of the form "direct term" + "indirect term"] + y'(i) = z'(i) * target-rms * rscale + 2/I y(i) var' + + Now, the above is inconvenient because it contains y(i) which is an intermediate + quantity. We reformulate in terms of z(i), using y(i) = z(i) / (target-rms * rscale), so: + + defining + var_deriv_mod = 2/I * var' / (target-rms * rscale) + = 2/I * power/target-rms * (\sum_i z'(i) z(i)) * rscale^{-(1+power)/power} + we have: + y'(i) = z'(i) * target-rms * rscale + z(i) var_deriv_mod + + Now, + mean' = \sum_i y'(i) + = (target-rms * rscale * \sum_i z'(i)) + (var_deriv_mod \sum_i z(i)) + [... and the 2nd term above is zero when summed over i, because \sum_i z(i) is zero, ...] + = target-rms * rscale * \sum_i z(i) + and: + x'(i) = z'(i) * target-rms * rscale + z(i) var_deriv_mod - 1/I mean' + = z'(i) * target-rms * rscale + z(i) var_deriv_mod - 1/I * target-rms * rscale * \sum_i z'(i) + = target-rms * rscale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod + + It will simplify the code if we define: + + scale = target-rms * rscale. 
This way, we can write as follows: + + BACKWARD PASS (recap): + + var_deriv_mod = 2 * power * target-rms^{1/power} * (1/I \sum_i z'(i) z(i)) * scale^{-(1+power)/power} + + x'(i) = scale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod + */ @@ -446,7 +428,7 @@ void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, Memo *memo = new Memo; int32 num_frames = in.NumRows(), dim = block_dim_; memo->num_frames = num_frames; - memo->mean_uvar_scale.Resize(4, dim); + memo->mean_uvar_scale.Resize(5, dim); CuSubVector mean(memo->mean_uvar_scale, 0), uvar(memo->mean_uvar_scale, 1), scale(memo->mean_uvar_scale, 2); @@ -454,14 +436,14 @@ void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, uvar.AddDiagMat2(1.0 / num_frames, in, kTrans, 0.0); scale.CopyFromVec(uvar); // by applying this scale at this point, we save a multiply later on. - BaseFloat var_scale = 1.0 / (target_rms_ * target_rms_); + BaseFloat var_scale = std::pow(target_rms_, -power_); scale.AddVecVec(-var_scale, mean, mean, var_scale); - // at this point, 'scale' contains just the variance [divided by target-rms^2]. + // at this point, 'scale' contains just the variance (times target-rms^{-power}) scale.ApplyFloor(0.0); scale.Add(var_scale * epsilon_); // Now 'scale' contains the variance floored to zero and then with epsilon - // added [both divided by target-rms^2]. - scale.ApplyPow(-0.5); + // added [both times target-rms^{-power}] + scale.ApplyPow(power_); // now 'scale' is the actual scale we'll use. // the next command will do no work if out == in, for in-place propagation. 
@@ -525,26 +507,47 @@ void BatchNormComponent::Backprop( KALDI_ASSERT(memo != NULL && "memo not passed into backprop"); int32 num_frames = memo->num_frames; KALDI_ASSERT(out_value.NumRows() == num_frames); - CuSubVector temp(memo->mean_uvar_scale, 3), - scale(memo->mean_uvar_scale, 2); + CuSubVector + scale(memo->mean_uvar_scale, 2), + temp(memo->mean_uvar_scale, 4), + var_deriv_mod(memo->mean_uvar_scale, 3), + scale_pow(memo->mean_uvar_scale, 4); + + // var_deriv_mod is going to contain: + // 2 * power * target-rms^{1/power} * (1/I \sum_i z'(i) z(i)) * scale^{-(1+power)/power} + // but for now we don't have the power of 'scale', we'll add that later. + BaseFloat coeff = 2.0 * power_ * std::pow(target_rms_, 1.0 / power_) / + num_frames; + var_deriv_mod.AddDiagMatMat(coeff, out_value, kTrans, + out_deriv, kNoTrans, 0.0); + + temp.AddRowSumMat(-1.0 / num_frames, out_deriv, 0.0); - // the following does no work if in_deriv and out_deriv are the same matrix. + // the following statement does no work if in_deriv and out_deriv are the same matrix. in_deriv->CopyFromMat(out_deriv); in_deriv->AddVecToRows(1.0, temp); + // At this point, *in_deriv contains + // (z'(i) - 1/I * \sum_i z'(i)) in_deriv->MulColsVec(scale); - // at this point, 'in_deriv' contains: - // x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale - temp.AddDiagMatMat(-1.0 / (num_frames * target_rms_ * target_rms_), - out_value, kTrans, *in_deriv, kNoTrans, 0.0); - // now, 'temp' contains the quantity which we described - // in the math as: - // alpha = - \sum_i y(i) x_deriv_base(i) / n. - // The factor 1 / (target_rms_ * target_rms_) comes from following - // this additional scaling factor through the math. In the comment I said - // "we know that \sum_i y(i) y(i) = n". Taking target-rms into account - // this becomes "we know that \sum_i y(i) y(i) = n * target-rms^2". - in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans, temp, 1.0); - // At this point, in_deriv contains x'(i) = x_deriv_base(i) + alpha y(i). 
+ // At this point, *in_deriv contains + // scale * (z'(i) - 1/I * \sum_i z'(i)) + + // The next few lines complete the calculation of 'var_deriv_mod'; + // we delayed it because we were using 'temp', and 'scale_pow' + // uses the same memory. + if (power_ == -0.5) { + // we can simplify scale^{-(1+power)/power} to just 'scale'. + var_deriv_mod.MulElements(scale); + } else { + scale_pow.CopyFromVec(scale); + scale_pow.ApplyPow(-1.0 * (1.0 + power_) / power_); + var_deriv_mod.MulElements(scale_pow); + } + in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans, + var_deriv_mod, 1.0); + // At this point, *in_deriv contains what we described in the comment + // starting BATCHNORM_MATH as: + // x'(i) = scale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod } else { KALDI_ASSERT(offset_.Dim() == block_dim_); // the next call does no work if they point to the same memory. @@ -598,6 +601,12 @@ void BatchNormComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &block_dim_); + if (PeekToken(is, binary) == 'P') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &power_); + } else { + power_ = -0.5; + } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &epsilon_); ExpectToken(is, binary, ""); @@ -625,6 +634,10 @@ void BatchNormComponent::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, block_dim_); + if (power_ != -0.5) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, power_); + } WriteToken(os, binary, ""); WriteBasicType(os, binary, epsilon_); WriteToken(os, binary, ""); diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index 5299862ee65..84b5dbd817a 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -227,13 +227,11 @@ class BatchNormComponent: public Component { struct Memo { // number of frames 
(after any reshaping). int32 num_frames; - // 'sum_sumsq_scale' is of dimension 4 by block_dim_: + // 'sum_sumsq_scale' is of dimension 5 by block_dim_: // Row 0 = mean = the mean of the rows of the input // Row 1 = uvar = the uncentered variance of the input (= sumsq / num_frames). // Row 2 = scale = the scale of the renormalization, which is - // Row 3 is used as a temporary in Backprop. - // the inverse stddev of the input (modified by epsilon_, - // see the Propagate function. + // Rows 3 and 4 are used as a temporaries in Backprop. CuMatrix mean_uvar_scale; }; @@ -260,6 +258,12 @@ class BatchNormComponent: public Component { // always will in the new code in nnet-convolutional-component.h. int32 block_dim_; + + // This power determines the scale as a power of the variance... the default + // (-0.5) corresponds to regular BatchNorm, but you can set it to other + // values, like -0.25 or -0.4, for what we'll call "fractional BatchNorm" + BaseFloat power_; + // Used to avoid exact-zero variances, epsilon has the dimension of a // covariance. BaseFloat epsilon_; @@ -311,6 +315,9 @@ class BatchNormComponent: public Component { is treated like a separate row of the input matrix, which means that the stats from n'th element of each block are pooled into one class, for each n.a + power Power that determines the scale we apply, as a function of + the variance. The default, -0.5, corresponds to regular + BatchNorm. epsilon Small term added to the variance that is used to prevent division by zero target-rms This defaults to 1.0, but if set, for instance, to 2.0, diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 83b902a9b90..781b4e558e9 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1681,6 +1681,7 @@ static void GenerateRandomComponentConfig(std::string *component_type, << " block-dim=" << block_dim << " target-rms=" << RandInt(1, 2) << " test-mode=" << (test_mode ? 
"true" : "false") + << " power=" << (-0.1 * RandInt(3, 5)) << " epsilon=" << (RandInt(0, 1) == 0 ? "0.1" : "1.0"); break; } From 6870287eaa9d43ebec5e360c124808e00b130804 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 27 Dec 2017 21:29:26 -0800 Subject: [PATCH 037/184] [src] Fix bug in normalize-component RE target-rms --- src/nnet3/nnet-normalize-component.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index 3f105cd8e2b..ad5fc2466d4 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -320,7 +320,7 @@ void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { /* - BATCH_NORM_MATH + BATCHNORM_MATH This comment describes the equations involved in batch normalization, and derives the forward and back-propagation. @@ -436,7 +436,7 @@ void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, uvar.AddDiagMat2(1.0 / num_frames, in, kTrans, 0.0); scale.CopyFromVec(uvar); // by applying this scale at this point, we save a multiply later on. - BaseFloat var_scale = std::pow(target_rms_, -power_); + BaseFloat var_scale = std::pow(target_rms_, 1.0 / power_); scale.AddVecVec(-var_scale, mean, mean, var_scale); // at this point, 'scale' contains just the variance (times target-rms^{-power}) scale.ApplyFloor(0.0); @@ -523,7 +523,8 @@ void BatchNormComponent::Backprop( temp.AddRowSumMat(-1.0 / num_frames, out_deriv, 0.0); - // the following statement does no work if in_deriv and out_deriv are the same matrix. + // the following statement does no work if in_deriv and out_deriv are the + // same matrix. 
in_deriv->CopyFromMat(out_deriv); in_deriv->AddVecToRows(1.0, temp); // At this point, *in_deriv contains From af1817591e847f0fcca0bdaa16ae3c271fc4ce1d Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 29 Dec 2017 15:41:49 -0500 Subject: [PATCH 038/184] [scripts] Add batchnorm-power and diagonal-power options to basic_layers.py --- egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index e62a090c25e..09681625034 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -674,6 +674,7 @@ def set_default_configs(self): 'bottleneck-dim': -1, 'self-repair-scale': 1.0e-05, 'target-rms': 1.0, + 'batchnorm-power': -0.5, 'ng-affine-options': '', 'ng-linear-options': '', # only affects bottleneck layers. 'dropout-proportion': 0.5, # dropout-proportion only @@ -685,6 +686,8 @@ def set_default_configs(self): 'bias-stddev': '', 'l2-regularize': '', 'learning-rate-factor': '', + 'diagonal-power-in': '', + 'diagonal-power-out': '', 'max-change': 0.75 } def check_configs(self): @@ -753,10 +756,12 @@ def _add_components(self, input_desc, input_dim, nonlinearities): output_dim = self.output_dim() self_repair_scale = self.config['self-repair-scale'] target_rms = self.config['target-rms'] + batchnorm_power = self.config['batchnorm-power'] affine_options = self.config['ng-affine-options'] for opt_name in [ 'max-change', 'learning-rate-factor', - 'bias-stddev', 'l2-regularize' ]: + 'bias-stddev', 'l2-regularize', + 'diagonal-power-in', 'diagonal-power-out' ]: value = self.config[opt_name] if value != '': affine_options += ' {0}={1}'.format(opt_name, value) @@ -844,9 +849,9 @@ def _add_components(self, input_desc, input_dim, nonlinearities): elif nonlinearity == 'batchnorm': line = ('component name={0}.{1}' ' type=BatchNormComponent 
dim={2}' - ' target-rms={3}' + ' target-rms={3} power={4}' ''.format(self.name, nonlinearity, output_dim, - target_rms)) + target_rms, batchnorm_power)) elif nonlinearity == 'memnorm': line = ('component name={0}.{1}' From 8368ab4e7320862c56205e6b1cba26cd8d2240e7 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 29 Dec 2017 18:44:17 -0800 Subject: [PATCH 039/184] [src,scripts] Remove MemoryNormComponent --- .../steps/libs/nnet3/xconfig/basic_layers.py | 7 - egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 7 +- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 1 - src/nnet3/nnet-component-itf.cc | 2 - src/nnet3/nnet-component-test.cc | 6 +- src/nnet3/nnet-compute.cc | 6 +- src/nnet3/nnet-normalize-component.cc | 527 ------------------ src/nnet3/nnet-normalize-component.h | 245 +------- src/nnet3/nnet-simple-component.cc | 6 - src/nnet3/nnet-test-utils.cc | 4 +- src/nnet3/nnet-utils.cc | 9 +- src/nnet3/nnet-utils.h | 10 +- 12 files changed, 22 insertions(+), 808 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 09681625034..6393709d82e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -853,13 +853,6 @@ def _add_components(self, input_desc, input_dim, nonlinearities): ''.format(self.name, nonlinearity, output_dim, target_rms, batchnorm_power)) - elif nonlinearity == 'memnorm': - line = ('component name={0}.{1}' - ' type=MemoryNormComponent dim={2}' - ' target-rms={3} ' - ''.format(self.name, nonlinearity, output_dim, - target_rms)) - elif nonlinearity == 'so': line = ('component name={0}.{1}' ' type=ScaleAndOffsetComponent dim={2} max-change=0.5 ' diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 5827ea4d179..d226a891113 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -914,9 +914,6 @@ def 
_generate_lstm_config(self): name, input_dim + cell_dim, bottleneck_dim, affine_str)) - configs.append("component name={0}.c_trunc_memnorm type=MemoryNormComponent dim={1} ".format( - name, cell_dim)) - configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, affine_str, l2_regularize_option)) @@ -940,7 +937,7 @@ def _generate_lstm_config(self): configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " - "IfDefined(Offset({0}.c_trunc_memnorm, {2})))".format( + "IfDefined(Offset({0}.c_trunc, {2})))".format( name, input_descriptor, delay)) configs.append("component-node name={0}.W_all_b component={0}.W_all_b " "input={0}.W_all_a".format(name)) @@ -955,8 +952,6 @@ def _generate_lstm_config(self): configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} " "dim={1}".format(name, cell_dim)) configs.append("component-node name={0}.c_trunc component={0}.c_trunc input={0}.c".format(name)) - configs.append("component-node name={0}.c_trunc_memnorm component={0}.c_trunc_memnorm " - "input={0}.c_trunc".format(name)) configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index db9550818cd..71205961681 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -25,7 +25,6 @@ 'relu-batchnorm-dropout-layer' : xlayers.XconfigBasicLayer, 'relu-dropout-layer': xlayers.XconfigBasicLayer, 'relu-batchnorm-layer' : xlayers.XconfigBasicLayer, - 'relu-memnorm-layer' : xlayers.XconfigBasicLayer, 'relu-batchnorm-so-layer' : xlayers.XconfigBasicLayer, 'batchnorm-so-relu-layer' : xlayers.XconfigBasicLayer, 'sigmoid-layer' : 
xlayers.XconfigBasicLayer, diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index f83ad26f375..8a52b7b788c 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -164,8 +164,6 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new LstmNonlinearityComponent(); } else if (component_type == "BatchNormComponent") { ans = new BatchNormComponent(); - } else if (component_type == "MemoryNormComponent") { - ans = new MemoryNormComponent(); } else if (component_type == "TimeHeightConvolutionComponent") { ans = new TimeHeightConvolutionComponent(); } else if (component_type == "RestrictedAttentionComponent") { diff --git a/src/nnet3/nnet-component-test.cc b/src/nnet3/nnet-component-test.cc index d7595378c1f..882ef112919 100644 --- a/src/nnet3/nnet-component-test.cc +++ b/src/nnet3/nnet-component-test.cc @@ -274,7 +274,7 @@ bool TestSimpleComponentDataDerivative(const Component &c, int32 input_dim = c.InputDim(), output_dim = c.OutputDim(), - num_rows = RandInt(1, 100), + num_rows = RandInt(1, 20), rand_seed = Rand(); int32 properties = c.Properties(); CuMatrix input_data(num_rows, input_dim, kSetZero, input_stride_type), @@ -317,7 +317,7 @@ bool TestSimpleComponentDataDerivative(const Component &c, } KALDI_LOG << "Predicted objf-change = " << predicted_objf_change; KALDI_LOG << "Measured objf-change = " << measured_objf_change; - BaseFloat threshold = 0.1; + BaseFloat threshold = 0.05; bool ans = ApproxEqual(predicted_objf_change, measured_objf_change, threshold); if (!ans) KALDI_WARN << "Data-derivative test failed, component-type=" @@ -442,7 +442,7 @@ bool TestSimpleComponentModelDerivative(const Component &c, void UnitTestNnetComponent() { - for (int32 n = 0; n < 200; n++) { + for (int32 n = 0; n < 2000; n++) { Component *c = GenerateRandomSimpleComponent(); KALDI_LOG << c->Info(); TestNnetComponentIo(c); diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 
12a4ec65ae9..23286211301 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -268,9 +268,9 @@ void NnetComputer::ExecuteCommand() { nnet_to_update = (computation_.need_model_derivative ? nnet_to_update_ : NULL); } else { - // Some non-updatable components, such as CompositeComponent and - // MemoryNormComponent, store stats in the backprop. For other - // types of component, this arg won't matter. + // Some non-updatable components, such as CompositeComponent, store + // stats in the backprop. For other types of non-updatable + // component, this arg won't matter. nnet_to_update = nnet_to_store_stats_; } if (nnet_to_update) diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index ad5fc2466d4..e6be8210bb0 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -697,532 +697,5 @@ void BatchNormComponent::ZeroStats() { } - - -/** - MEMORY_NORM_MATH - - This comment describes the equations involved in 'memory-norm'. - memory-norm is like batch normalization, except instead of computing - everything on the current minibatch, we deal with decaying averages - over time, interpreted as expectations. We'll firm up the math later. - The idea is to obtain a form of batch-norm that is compatible with - use in recurrent neural nets. - - Everything is dimension by dimension here, so let's imagine the input and - output are one-dimensional. Any index 'i' is going to be like a frame index - or an index referring to a sample. We'll be writing down some expectations, - and we're rather cavalier with notation; these basically mean - exponentially-decaying weighted averages over time. - - The input will be x(i), and the output y(i). - - Each frame will have a weight, w(i) >= 0. (these will be part of the - decaying averages)... 
- - Let's define - count = \sum_i w(i) - sum = \sum_i w(i) x(i) - sumsq = \sum_i w(i) x(i)^2 - - We can compute: - mean = sum / count - var = epsilon + (sumsq / count) - (mean * mean) - scale = target_rms * var^{-0.5} - - y(i) = (x(i) - mean) * scale. - - We are given the derivatives of the objective function w.r.t. the - outputs; we'll write these as y'(i) [CAUTION: this is nonstandard - notation. An apostrophe on something means the derivative of the - objective function w.r.t. that thing]. - - Over this data, with these weights, we can compute the derivative - of the objective w.r.t. the mean and the scale: - - mean' = -scale * \sum_i w(i) y'(i) - scale' = \sum_i w(i) y'(i) (x(i) - mean) - = 1/scale \sum_i w(i) y'(i) y(i) - var' = -0.5 target_rms var^{-1.5} scale' - = -0.5 target_rms var^{-1.5} (1/scale) \sum_i w(i) y'(i) y(i) - .. and using 1/scale = var^{0.5}/target_rms, - = -0.5 var^{-1} \sum_i w(i) y'(i) y(i) (*) - - - It will be convenient to write down 'per-frame' versions of all of these - quantities, which are divided by the total count: - mean_norm' = mean' / count - scale_norm' = scale' / count - var_norm' = var' / count - (we keep the apostrophe on these quantities as it clarifies that they - are derivatives of the objective function w.r.t something). - - Now, 'var' can be written as: - var = epsilon + (1/count) \sum_i w(i) (x(i) - mean)^2 - and the following formula is more convenient to propagate the derivative - back to an x(i). - Note: the following has 3 terms, which we can think of as - "direct term" (given fixed mean and scale), - "term via mean" (term that comes via derivative of the mean) - "term via scale" (term that comes via derivative of the scale) - - - x'(i) = y'(i)*scale + mean_norm' + 2 var_norm' (x(i) - mean) - = y'(i)*scale + mean_norm' + 2 var_norm' y(i) / scale - ... 
and substituting in the equation (*) above for var', using var_norm' = var'/scale, - and rearranging slightly: - = y'(i)*scale + mean_norm' - y(i) * var^{-1}/scale * 1/count * \sum_i w(i) y'(i) y(i) - .. and using scale=target-rms * var^{-0.5}, so var^{-1}/scale = var^{-0.5}/target-rms = scale/target-rms^2: - = y'(i)*scale + mean_norm' - y(i) * scale/(count*target-rms^2) * \sum_i w(i) y'(i) y(i) - .. and considering that the factor of 'scale' appears (directly or indirectly) in all 3 - of the terms in the above expression, we can reorganize this as: - = scale * (y'(i) - 1/count*\sum_i w(i)*y(i) - 1/(count*target-rms^2) * \sum_i w(i) y'(i) y(i)) -*/ - - -void MemoryNormComponent::SetTestMode(bool test_mode) { - if (test_mode && stats_count_ <= 0) { - KALDI_WARN << "Refusing to set test-mode in MemoryNormComponent since no " - "stats are present."; - return; - } - test_mode_ = test_mode; -} - -void MemoryNormComponent::Check() const { - KALDI_ASSERT(dim_ > 0 && block_dim_ > 0 && dim_ % block_dim_ == 0 && - epsilon_ > 0.0 && target_rms_ > 0.0 && - stats_count_ >= 0.0 && backward_count_ >= 0.0); - -} - -MemoryNormComponent::MemoryNormComponent(const MemoryNormComponent &other): - dim_(other.dim_), block_dim_(other.block_dim_), epsilon_(other.epsilon_), - target_rms_(other.target_rms_), - include_indirect_derivative_(other.include_indirect_derivative_), - test_mode_(other.test_mode_), - stats_count_(other.stats_count_), backward_count_(other.backward_count_), - data_(other.data_) { - Check(); -} - - -std::string MemoryNormComponent::Info() const { - std::ostringstream stream; - stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ - << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ - << ", include-indirect-derivative=" - << (include_indirect_derivative_ ? "true" : "false") - << ", stats-count=" << stats_count_ << ", backward-count=" - << backward_count_ - << ", test-mode=" << (test_mode_ ? 
"true" : "false"); - if (stats_count_ > 0.0) { - CuSubVector x_mean(data_, 0), - y_deriv(data_, 2), y_deriv_y(data_, 3), - scale(data_, 4); - if (stats_count_ > 0.0) - stream << ", x-mean=" << SummarizeVector(x_mean) - << ", scale=" << SummarizeVector(scale); - if (backward_count_ > 0.0) - stream << ", y-deriv=" << SummarizeVector(y_deriv) - << ", y-deriv-y=" << SummarizeVector(y_deriv_y); - } - return stream.str(); -} - -void MemoryNormComponent::InitFromConfig(ConfigLine *cfl) { - dim_ = -1; - block_dim_ = -1; - epsilon_ = 1.0e-03; - target_rms_ = 1.0; - include_indirect_derivative_ = true; - test_mode_ = false; - - bool ok = cfl->GetValue("dim", &dim_); - cfl->GetValue("block-dim", &block_dim_); - cfl->GetValue("epsilon", &epsilon_); - cfl->GetValue("target-rms", &target_rms_); - cfl->GetValue("include-indirect-derivative", &include_indirect_derivative_); - cfl->GetValue("test-mode", &test_mode_); - if (!ok || dim_ <= 0) { - KALDI_ERR << "MemoryNormComponent must have 'dim' specified, and > 0"; - } - if (block_dim_ == -1) - block_dim_ = dim_; - if (!(block_dim_ > 0 && dim_ % block_dim_ == 0 && - epsilon_ > 0 && target_rms_ > 0)) - KALDI_ERR << "Invalid configuration in MemoryNormComponent."; - if (cfl->HasUnusedValues()) - KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); - stats_count_ = 0.0; - backward_count_ = 0.0; - data_.Resize(5, block_dim_); -} - - - -void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - KALDI_ASSERT(SameDim(in, *out) && - (in.NumCols() == dim_ || in.NumCols() == block_dim_)); - if (in.NumCols() != block_dim_) { - // if block_dim_ != dim_, we recurse; this helps keep the main code - // simple. 
- KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); - int32 ratio = dim_ / block_dim_, orig_rows = in.NumRows(), - orig_cols = in.NumCols(), new_rows = orig_rows * ratio, - new_cols = orig_cols / ratio; - CuSubMatrix in_reshaped(in.Data(), new_rows, new_cols, new_cols), - out_reshaped(out->Data(), new_rows, new_cols, new_cols); - return Propagate(indexes, in_reshaped, &out_reshaped); - } - - if (out->Data() != in.Data()) - out->CopyFromMat(in); - - if (test_mode_ && stats_count_ <= 0.0) - KALDI_ERR << "Test mode set but no stats available."; - - // From this point, we can assume that the num-cols of 'in' and 'out' - // equals block_dim_. - Memo *memo = NULL; - if (!test_mode_) { - memo = GetMemo(in); - } - - if (test_mode_) { - CuSubVector x_mean(data_, 0), scale(data_, 4); - out->AddVecToRows(-1.0, x_mean); - out->MulColsVec(scale); - } else { - CuSubVector x_mean(memo->data, 0), - scale(memo->data, 4); - out->AddVecToRows(-1.0, x_mean); - out->MulColsVec(scale); - } - return memo; -} - - -MemoryNormComponent::Memo* MemoryNormComponent::GetMemo( - const CuMatrixBase &in) const { - KALDI_ASSERT(in.NumCols() == block_dim_ && !test_mode_ && - stats_count_ >= 0.0); - Memo *memo = new Memo; - BaseFloat old_stats_count = stats_count_, - num_frames = in.NumRows(), - new_stats_count = num_frames + old_stats_count, - old_weight = old_stats_count / new_stats_count; - - // The information in 'memo' will be copied to *this when - // StoreStats() is called (we can't update it in the Propagate() - // function for 'const' reasons). - memo->stats_count = new_stats_count; - memo->backward_count = backward_count_; - memo->data = data_; - - CuSubVector x_mean(memo->data, 0), - x_uvar(memo->data, 1), scale(memo->data, 4); - // Each row of 'in' gets a weight of 1.0 / new_stats_count in the stats. 
- x_mean.AddRowSumMat(1.0 / new_stats_count, in, old_weight); - x_uvar.AddDiagMat2(1.0 / new_stats_count, in, kTrans, old_weight); - - scale.CopyFromVec(x_uvar); - // we save a CUDA operation by applying the scale 'target_rms_scale' before doing - // ApplyPow(-0.5), and this requires taking it to the power -2. - BaseFloat target_rms_scale = 1.0 / (target_rms_ * target_rms_); - scale.AddVecVec(-target_rms_scale, x_mean, x_mean, target_rms_scale); - // at this point, 'scale' is the variance. - scale.ApplyFloor(0.0); - scale.Add(epsilon_ * target_rms_scale); - scale.ApplyPow(-0.5); - // OK, now 'scale' is the scale. - return memo; -} - -void MemoryNormComponent::Backprop( - const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, // unused. - const CuMatrixBase &out_value, - const CuMatrixBase &out_deriv, - void *memo_in, - Component *to_update_in, - CuMatrixBase *in_deriv) const { - - KALDI_ASSERT(SameDim(out_deriv, *in_deriv) && - (out_deriv.NumCols() == dim_ || - out_deriv.NumCols() == block_dim_)); - if (out_deriv.NumCols() != block_dim_) { - // if block_dim_ != dim_, we recurse; this helps keep the main code - // simple. - KALDI_ASSERT(out_deriv.Stride() == out_deriv.NumCols() && - in_deriv->Stride() == in_deriv->NumCols()); - if (out_value.NumRows() != 0) { - KALDI_ASSERT(out_value.Stride() == out_value.NumCols()); - } - int32 ratio = dim_ / block_dim_, - orig_rows = out_value.NumRows(), - orig_cols = out_value.NumCols(), - new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; - CuSubMatrix - out_deriv_reshaped(out_deriv.Data(), new_rows, new_cols, new_cols), - in_deriv_reshaped(in_deriv->Data(), new_rows, new_cols, new_cols); - - // we'll never use in_value, so pass it in unchanged. 
- if (out_value.NumRows() != 0) { - CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, - new_cols, new_cols); - Backprop(debug_info, indexes, in_value, - out_value_reshaped, out_deriv_reshaped, - memo_in, to_update_in, &in_deriv_reshaped); - } else { - Backprop(debug_info, indexes, in_value, - out_value, out_deriv_reshaped, - memo_in, to_update_in, &in_deriv_reshaped); - } - return; - } - - // assume in_deriv is non-NULL, because a non-updatable Component will not - // have the backprop called if the in_deriv is non-NULL. - - if (test_mode_) { - // In test mode we treat it as a fixed scale and offset. - KALDI_ASSERT(memo_in == NULL && stats_count_ != 0.0); - // the following is a no-op if in_deriv and out_deriv are the same matrix. - in_deriv->CopyFromMat(out_deriv); - CuSubVector scale(data_, 4); - in_deriv->MulColsVec(scale); - return; - } - - // OK, we're not in test mode. - // Before computing 'in_deriv', we may need to store some stats. - if (include_indirect_derivative_ && to_update_in != NULL) { - // Store some stats which are necessary to compute the 'indirect derivative' - // term (this is analogous to the part of the derivative in regular backprop - // that comes from the objf derivative w.r.t. the mean and variance stats). - // - // Note: instead of simply adding to the stats 'y_deriv' and 'y_deriv_y', - // the following equations do a kind of weighted combination, because - // these stats are stored normalized by the total count (backward_count_). 
- MemoryNormComponent *to_update = - dynamic_cast(to_update_in); - BaseFloat backward_count = to_update->backward_count_, - num_frames = in_deriv->NumRows(), - new_backward_count = backward_count + num_frames, - old_weight = backward_count / new_backward_count; - CuSubVector y_deriv(to_update->data_, 2), - y_deriv_y(to_update->data_, 3); - // The factor 1.0 / new_backward_count that appears below can be perhaps more - // clearly written as follows: first define - // new_weight = num_frames / new_backward_count - // and then write new_weight / num_frames, which simplifies to - // 1.0 / new_backward_count. The factor of 1.0 / num_frames is necessary to - // convert from data sums to a per-frame average. - y_deriv.AddRowSumMat(1.0 / new_backward_count, out_deriv, old_weight); - y_deriv_y.AddDiagMatMat(1.0 / new_backward_count, out_deriv, kTrans, - out_value, kNoTrans, old_weight); - to_update->backward_count_ = new_backward_count; - - // Now 'to_update' will typically be the same as 'this', so we need - // to compute the derived parameters because it affects some code that's - // below. - to_update->ComputeDerived(); - } - - // the following does no work if in_deriv and out_deriv are the same matrix. - in_deriv->CopyFromMat(out_deriv); - - if (this->backward_count_ != 0.0) { - CuSubVector y_deriv(data_, 2), - y_deriv_y(data_, 3); - in_deriv->AddVecToRows(-1.0, y_deriv); - in_deriv->AddMatDiagVec(-1.0 / (target_rms_ * target_rms_), - out_value, kNoTrans, y_deriv_y); - } - CuSubVector scale(data_, 4); - in_deriv->MulColsVec(scale); -} - - -void MemoryNormComponent::ComputeDerived() { - KALDI_ASSERT(stats_count_ >= 0.0 && data_.NumRows() == 5); - if (stats_count_ == 0.0) { - // zero 'scale'. 
- data_.Row(4).SetZero(); - return; - } - CuSubVector x_mean(data_, 0), x_uvar(data_, 1), - scale(data_, 4); - scale.CopyFromVec(x_uvar); - // we save a CUDA operation by applying the scale 'target_rms_scale' before doing - // ApplyPow(-0.5), and this requires taking it to the power -2. - BaseFloat target_rms_scale = 1.0 / (target_rms_ * target_rms_); - scale.AddVecVec(-target_rms_scale, x_mean, x_mean, target_rms_scale); - // at this point, 'scale' is the variance (divided by target_rms^2). - scale.ApplyFloor(0.0); - scale.Add(epsilon_ * target_rms_scale); - scale.ApplyPow(-0.5); -} - -void MemoryNormComponent::StoreStats( - const CuMatrixBase &, // in_value - const CuMatrixBase &, // out_value - void *memo_in) { - // in test mode this component does not store stats; it doesn't provide the - // kStoresStats flag so this function won't be called. - KALDI_ASSERT(!test_mode_ && memo_in != NULL && stats_count_ >= 0.0); - - // We don't actually need 'in_value' and 'out_value', as the - // required statistics are already stored in 'memo_in'. - Memo *memo = static_cast(memo_in); - - // check that the memo's stats count is more than our stats_count_, - // which it should be because the memo should have added extra stats, - // and StoreStats() should be called directly after the Propagate() - // function. - // This could possibly fail with memo_in->stats_count == stats_count_ - // due to roundoff, if you trained with batchnorm-stats-scale set at 1, - // but that would be a poor choice of parameters anyway as - // roundoff would be a big problem. - KALDI_ASSERT(memo->stats_count > stats_count_); - - stats_count_ = memo->stats_count; - // Copying the entire data matrix should be safe because - // StoreStats() is always called directly after the corresponding - // Propagate(), and on the same object; and there should be - // no possibility that other things in this->data_ changed in - // the interim. 
- data_.CopyFromMat(memo->data); -} - -void MemoryNormComponent::Read(std::istream &is, bool binary) { - ExpectOneOrTwoTokens(is, binary, "", ""); - ReadBasicType(is, binary, &dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &block_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &epsilon_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &target_rms_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &include_indirect_derivative_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &test_mode_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &stats_count_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &backward_count_); - ExpectToken(is, binary, ""); - data_.Read(is, binary); - ExpectToken(is, binary, ""); - Check(); -} - -void MemoryNormComponent::Write(std::ostream &os, bool binary) const { - Check(); - WriteToken(os, binary, ""); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, block_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, epsilon_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, target_rms_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, include_indirect_derivative_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, test_mode_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, stats_count_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, backward_count_); - WriteToken(os, binary, ""); - data_.Write(os, binary); - WriteToken(os, binary, ""); -} - -void MemoryNormComponent::Scale(BaseFloat scale) { - if (scale <= 0) { - if (scale < 0.0) - KALDI_WARN << "Setting stats to zero in MemoryNormComponent: requested scale = " - << scale; - // If scale is negative we zero the stats. This may not always be the right - // thing to do, so we warn. 
- data_.SetZero(); - stats_count_ = 0.0; - backward_count_ = 0.0; - } else { - stats_count_ *= scale; - backward_count_ *= scale; - // 'data_' doesnt need to be changed, as all the quantities it contains are - // normalized by the count. - } -} - - -void MemoryNormComponent::Add(BaseFloat alpha, const Component &other_in) { - const MemoryNormComponent *other = - dynamic_cast(&other_in); - - static bool warned = false; - if (alpha < 0.0) { - if (!warned) { - warned = true; - KALDI_WARN << "Adding MemoryNormComponent with negative scale: will do nothing " - << "(will not warn again)."; - } - return; - } - - if (alpha * other->stats_count_ == 0.0 && - alpha * other->backward_count_ == 0.0) - return; - - BaseFloat - new_stats_count = stats_count_ + alpha * other->stats_count_, - new_backward_count = backward_count_ + alpha * other->backward_count_; - - if (new_stats_count > 0.0) { - // This block sets rows 0 and 1 of data_, which we call 'x_mean' and - // 'x_uvar, to the appropriate weighted combination of 'this' and 'other'. - BaseFloat this_scale = stats_count_ / new_stats_count, - other_scale = alpha * other->stats_count_ / new_stats_count; - data_.RowRange(0, 2).Scale(this_scale); - data_.RowRange(0, 2).AddMat(other_scale, other->data_.RowRange(0, 2)); - } - if (new_backward_count > 0.0) { - // This block sets rows 2 and 3 of data_, which we call 'y_deriv' and - // 'y_deriv_y', to the appropriate weighted combination of 'this' and - // 'other'. - BaseFloat this_scale = backward_count_ / new_backward_count, - other_scale = alpha * other->backward_count_ / new_backward_count; - data_.RowRange(2, 2).Scale(this_scale); - data_.RowRange(2, 2).AddMat(other_scale, other->data_.RowRange(2, 2)); - } - stats_count_ = new_stats_count; - backward_count_ = new_backward_count; - ComputeDerived(); -} - -void MemoryNormComponent::ZeroStats() { - // We only zero the stats if we're not in test mode. 
In test mode, this would - // be dangerous as the stats aren't really considered to be stats, they become - // a fixed part of the model. - if (!test_mode_) { - stats_count_ = 0.0; - backward_count_ = 0.0; - data_.SetZero(); - } -} - - - - } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index 84b5dbd817a..b10c3e4a60c 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -37,17 +37,19 @@ namespace nnet3 { /// @file nnet-normalize-component.h /// /// This file contains declarations of components that in one way or -/// another normalize their input: NormalizeComponent, BatchNormComponent, -/// and MemoryNormComponent. +/// another normalize their input: NormalizeComponent and BatchNormComponent. /* - Implements the function: + NormalizeComponent implements the function: y = x * (sqrt(dim(x)) * target-rms) / |x| - where |x| is the 2-norm of the vector x. I.e. its output is its input - scaled such that the root-mean-square values of its elements equals - target-rms. (As a special case, if the input is zero, it outputs zero). + where |x| is the 2-norm of the vector x. I.e. its output is its input + scaled such that the root-mean-square values of its elements equals + target-rms. (As a special case, if the input is zero, it outputs zero). + This is like Hinton's layer-norm, except not normalizing the mean, only + the variance. + Note: if you specify add-log-stddev=true, it adds an extra element to y which equals log(|x| / sqrt(dim(x))). @@ -299,237 +301,6 @@ class BatchNormComponent: public Component { }; -/* - MemoryNormComponent is like batch normalization, except the stats - are accumulated as a weighted sum over past minibatches (if this is - not the first minibatch), instead of over the current minibatch. - Caution: we don't test this component in the standard way because it - would fail the derivative tests. 
- - You can use it in the same way you would normally use BatchNormComponent. - - Accepted configuration values: - dim Dimension of the input and output - block-dim Defaults to 'dim', but may be set to a nonzero divisor - of 'dim'. In this case, each block of dimension 'block-dim' - is treated like a separate row of the input matrix, which - means that the stats from n'th element of each - block are pooled into one class, for each n.a - power Power that determines the scale we apply, as a function of - the variance. The default, -0.5, corresponds to regular - BatchNorm. - epsilon Small term added to the variance that is used to prevent - division by zero - target-rms This defaults to 1.0, but if set, for instance, to 2.0, - it will normalize the standard deviation of the output to - 2.0. 'target-stddev' might be a more suitable name, but this - was chosen for consistency with NormalizeComponent. - include-indirect-derivative This defaults to true, which means we - include the (smaller) derivative term that comes via the - mean and variance estimation. You might want to set this to - false for testing purposes. - */ -class MemoryNormComponent: public Component { - public: - - MemoryNormComponent() { } - - // constructor using another component - MemoryNormComponent(const MemoryNormComponent &other); - - virtual int32 InputDim() const { return dim_; } - virtual int32 OutputDim() const { return dim_; } - - virtual std::string Info() const; - virtual void InitFromConfig(ConfigLine *cfl); - virtual std::string Type() const { return "MemoryNormComponent"; } - virtual int32 Properties() const { - // If the block-dim is less than the dim, we need the input and output - // matrices to be contiguous (stride==num-cols), as we'll be reshaping - // internally. This is not much of a cost, because this will be used - // in convnets where we have to do this anyway. 
- bool iid = include_indirect_derivative_; - return kSimpleComponent|kPropagateInPlace|kBackpropInPlace| - (test_mode_ ? 0 : kUsesMemo|kStoresStats|(iid?kBackpropNeedsOutput:0))| - (block_dim_ < dim_ ? kInputContiguous|kOutputContiguous : 0); - - } - - // Call this function to set 'test mode' to true or false. In test - // mode the stats are frozen and will not be updated. - void SetTestMode(bool test_mode); - - - virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const; - - /// The backprop function. In addition to propagating the input back to - /// 'in_deriv', if supplied, this function also updates, in 'to_update', - /// backward_count_ and the rows named 'y_deriv' and 'y_deriv_y' of - /// data_, and also the derived quantities 'x_deriv' and 'scale_deriv' - /// of data_. - /// (note: in training, 'to_update' will point to delta_nnet_, and later these - /// stats get added to nnet_ via Add()) - virtual void Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &, // in_value - const CuMatrixBase &out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update, - CuMatrixBase *in_deriv) const; - - virtual void Read(std::istream &is, bool binary); // This Read function - // requires that the Component has the correct type. - - /// Write component to stream - virtual void Write(std::ostream &os, bool binary) const; - virtual Component* Copy() const { return new MemoryNormComponent(*this); } - - // Note: if you scale by a negative number it will set stats to zero - // rather than allow a negative stats count. - virtual void Scale(BaseFloat scale); - // Note: if you try to add with negative coefficient (as in backstitch), it - // will do nothing. 
- virtual void Add(BaseFloat alpha, const Component &other); - virtual void ZeroStats(); - - virtual void DeleteMemo(void *memo) const { delete static_cast(memo); } - - /// This function updates stats_count_, the rows named 'x_mean', 'x_uvar' - /// of data_, and also the derived quantities stored in the rows named - /// 'scale', 'x_deriv' and 'scale_deriv' of data_. - /// (note: in training, this is called on the delta_nnet_, and later - /// the stats get added to nnet_ via Add()) - virtual void StoreStats(const CuMatrixBase &, // in_value - const CuMatrixBase &, // out_value - void *memo); - - private: - - struct Memo { - // 'stats_count' is the same as stats_count_ in the MemoryNormComponent - // from whose Propagate() function this memo was generated, plus - // the number of frames we're propagating (this is after any reshaping - // if block_dim_ != dim_). - BaseFloat stats_count; - - // 'stats_count' is the same as stats_count_ in the MemoryNormComponent - // from whose Propagate() function this memo was generated. It's mainly - // included because the backprop code wants to see if this was nonzero. - BaseFloat backward_count; - - // The structure of 'data' is the same as the data_ member of - // MemoryNormComponent; it's a matrix of dimension 5 by block_dim_. - // It will differ from the data_ member of the component we generated this - // from by the addition of some extra data in the 'x_sum' and 'x_sumsq' - // stats, and a corresponding modification of the 'scale', 'x_deriv' - // and 'scale_deriv' quantities. - // - // (note: the reason we update the stats before propagation rather - // than after, is for stability: otherwise, with relu units, if we only - // update the stats after the propagation we get a particular pathology: if - // a unit was previously always zero it will get a big scale; and if then we - // start getting some nonzero output, the scale on it will be too large.) 
- CuMatrix data; - }; - - - /// This piece of code, which has been broken out from Propagate(), computes - /// the memo. Expects in.NumCols() == block_dim_. It should only be called - /// if test_mode_ is false. - Memo *GetMemo(const CuMatrixBase &in) const; - - /// This function computes certain members of data_ that are derived: - /// specifically, rows 4, 5 and 6, which are called 'scale', 'x_deriv' and - /// 'scale_deriv'. - void ComputeDerived(); - - void Check() const; - - // this function is used in a couple of places; it turns the raw stats into - // the offset/scale term of a normalizing transform. - static void ComputeOffsetAndScale(BaseFloat count, - BaseFloat epsilon, - const Vector &stats_sum, - const Vector &stats_sumsq, - Vector *offset, - Vector *scale); - - // Dimension of the input and output. - int32 dim_; - - // block_dim_ would normally be the same as dim_, but if it's less (and it - // must be > 0 and must divide dim_), then each separate block of the input of - // dimension 'block_dim_' is treated like a separate frame for the purposes of - // normalization. This can be used to implement spatial batch normalization - // for convolutional setups-- assuming the filter-dim has stride 1, which it - // always will in the new code in nnet-convolutional-component.h. - int32 block_dim_; - - // Used to avoid exact-zero variances, epsilon has the dimension of a - // covariance. - BaseFloat epsilon_; - - // This controls the dynamic range of the output. At 1.0 which is the - // default, the output has unit standard deviation, but you can set it to - // other values. The same config exists in NormalizeComponent. - BaseFloat target_rms_; - - // If true, we include the smaller indirect part of the derivative, that comes - // via the stats estimation. This is included mostly for testing purposes; we - // expect this will normally be true. - bool include_indirect_derivative_; - - // If test_mode_ is set, no stats will be accumulated. 
It's an error if - // test_mode_ is set and the data count is zero, and you try to propagate. - bool test_mode_; - - // The total count of stats stored by StoreStats(), and which are represented - // in x_mean = data_.Row(0) and x_uvar = data_.Row(1). We never allow this to - // become less than zero, even if people do unexpected things with Add() and - // Scale(). - BaseFloat stats_count_; - - // backward_count_ is the total count of stats accumulated during backprop, - // and represents the count correspondsing to the stats in 'y_deriv' and - // 'y_deriv_y'. It is expected to be either zero or the same as stats_count_, - // in most circumstances, depending whether you were doing backprop or just - // inference-- but we don't enforce this because there may be situations where - // this is not the case. - // - // We never allow this to become less than zero, even if people do unexpected - // things with Add() and Scale(). - BaseFloat backward_count_; - - // We store data_ as a single matrix because it enables certain operations - // to be done using fewer kernels, but it contains various different quantities, - // which we'll describe below as if they were separate variables. - // data_ is of dimension 5 by block_dim_. - CuMatrix data_; - // data_.Row(0) is 'x_mean', which is the decaying moving-average of - // input data x; or zero if stats_count_ is zero. - // data_.Row(1) is 'x_uvar', which is the decaying moving-average of - // input data x^2 or zero if stats_count_ is zero. - // data_.Row(2) is 'y_deriv', which is the decaying moving-average - // derivative of the objective w.r.t. the output y; or - // zero if backward_count_ is zero. - // data_.Row(3) is 'y_deriv_y', which the decaying moving average - // of the product of the output times (the derivative of the - // objective w.r.t. the output); or zero if backward_count_ - // is zero. - // - // The quantity below is derived from the stats above. 
- // - // data_.Row(4) is 'scale', which is the inverse square root of the - // covariance computed from x_mean and x_uvar (plus epsilon), - // or zero if stats_count_ is zero. -}; - - - - } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index da14c188244..7d84c2e9518 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -4981,12 +4981,6 @@ void CompositeComponent::Init(const std::vector &components, max_rows_process_ = max_rows_process; for (size_t i = 0; i < components_.size(); i++) { - if (components_[i]->Type() == "MemoryNormComponent") { - // This is out of concerns about the fact that the stats accumulation - // is done in the backprop, not in the forward propagation. - KALDI_ERR << "MemoryNormComponent cannot currently exist inside " - "CompositeComponent"; - } // make sure all constituent components are simple. KALDI_ASSERT(components_[i]->Properties() & kSimpleComponent); if (i > 0) { diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 781b4e558e9..472a02197e5 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1675,11 +1675,11 @@ static void GenerateRandomComponentConfig(std::string *component_type, // labels to the most recently added component, so it gets tested more case 31: { *component_type = "BatchNormComponent"; - int32 block_dim = RandInt(1, 10), dim = block_dim * RandInt(1, 2); + int32 block_dim = RandInt(1, 20), dim = block_dim * RandInt(1, 2); bool test_mode = (RandInt(0, 1) == 0); os << " dim=" << dim << " block-dim=" << block_dim << " target-rms=" - << RandInt(1, 2) << " test-mode=" + << RandInt(1, 4) << " test-mode=" << (test_mode ? "true" : "false") << " power=" << (-0.1 * RandInt(3, 5)) << " epsilon=" << (RandInt(0, 1) == 0 ? 
"0.1" : "1.0"); diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 2a92e1f5a44..488a711e09d 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -492,8 +492,7 @@ void SetDropoutProportion(BaseFloat dropout_proportion, bool HasBatchnorm(const Nnet &nnet) { for (int32 c = 0; c < nnet.NumComponents(); c++) { const Component *comp = nnet.GetComponent(c); - if (dynamic_cast(comp) != NULL || - dynamic_cast(comp) != NULL) + if (dynamic_cast(comp) != NULL) return true; } return false; @@ -509,9 +508,6 @@ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale, BatchNormComponent *bc = dynamic_cast(comp); if (bc != NULL) bc->Scale(batchnorm_stats_scale); - MemoryNormComponent *mc = dynamic_cast(comp); - if (mc != NULL) - mc->Scale(batchnorm_stats_scale); } } @@ -536,9 +532,6 @@ void SetBatchnormTestMode(bool test_mode, Nnet *nnet) { BatchNormComponent *bc = dynamic_cast(comp); if (bc != NULL) bc->SetTestMode(test_mode); - MemoryNormComponent *mc = dynamic_cast(comp); - if (mc != NULL) - mc->SetTestMode(test_mode); } } diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 6f9b6cb959f..fc1631a8d77 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -168,12 +168,10 @@ std::string NnetInfo(const Nnet &nnet); void SetDropoutProportion(BaseFloat dropout_proportion, Nnet *nnet); -/// Returns true if nnet has at least one component of type -/// BatchNormComponent or MemoryNormComponent +/// Returns true if nnet has at least one component of type BatchNormComponent. bool HasBatchnorm(const Nnet &nnet); -/// This function affects only components of type BatchNormComponent or -/// MemoryNormComponent. +/// This function affects only components of type BatchNormComponent. /// It sets "test mode" on such components (if you call it with test_mode = /// true, otherwise it would set normal mode, but this wouldn't be needed /// often). 
"test mode" means that instead of using statistics from the batch, @@ -445,8 +443,8 @@ void ApplyL2Regularization(const Nnet &nnet, /** This function scales the batchorm stats of any batchnorm components - (components of type BatchNormComponent or MemoryNormComponent) in 'nnet' by - the scale 'batchnorm_stats_scale'. + (components of type BatchNormComponent) in 'nnet' by the scale + 'batchnorm_stats_scale'. */ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale, Nnet *nnet); From f654281ef57952b1a37a9fc48ce45ef7ac7e6cec Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 29 Dec 2017 20:44:32 -0800 Subject: [PATCH 040/184] [src,scripts] Removing diagonal extension to natural gradient code (did not seem helpful) --- .../steps/libs/nnet3/xconfig/basic_layers.py | 5 +- src/nnet3/natural-gradient-online.cc | 137 +----------------- src/nnet3/natural-gradient-online.h | 112 -------------- src/nnet3/nnet-simple-component.cc | 61 +------- src/nnet3/nnet-simple-component.h | 25 ---- 5 files changed, 9 insertions(+), 331 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 6393709d82e..c59e4a6041e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -686,8 +686,6 @@ def set_default_configs(self): 'bias-stddev': '', 'l2-regularize': '', 'learning-rate-factor': '', - 'diagonal-power-in': '', - 'diagonal-power-out': '', 'max-change': 0.75 } def check_configs(self): @@ -760,8 +758,7 @@ def _add_components(self, input_desc, input_dim, nonlinearities): affine_options = self.config['ng-affine-options'] for opt_name in [ 'max-change', 'learning-rate-factor', - 'bias-stddev', 'l2-regularize', - 'diagonal-power-in', 'diagonal-power-out' ]: + 'bias-stddev', 'l2-regularize' ]: value = self.config[opt_name] if value != '': affine_options += ' {0}={1}'.format(opt_name, value) diff --git a/src/nnet3/natural-gradient-online.cc 
b/src/nnet3/natural-gradient-online.cc index 83702626f5f..19a7d5fafdc 100644 --- a/src/nnet3/natural-gradient-online.cc +++ b/src/nnet3/natural-gradient-online.cc @@ -28,8 +28,7 @@ OnlineNaturalGradient::OnlineNaturalGradient(): rank_(40), update_period_(1), num_samples_history_(2000.0), num_minibatches_history_(0.0), alpha_(4.0), epsilon_(1.0e-10), delta_(5.0e-04), frozen_(false), t_(0), - self_debug_(false), - diagonal_power_(0.0), diagonal_epsilon_(1.0e-03) { } + self_debug_(false) { } /** @@ -182,14 +181,10 @@ void OnlineNaturalGradient::PreconditionDirections( bool updating = Updating(); BaseFloat initial_product; - if (diagonal_power_ == 0.0 || scale != NULL) - initial_product = TraceMatMat(*X_t, *X_t, kTrans); + initial_product = TraceMatMat(*X_t, *X_t, kTrans); - if (diagonal_power_ == 0.0) - PreconditionDirectionsInternal(rho_t, initial_product, - updating, d_t, &WJKL_t, X_t); - else - PreconditionDirectionsDiagonal(rho_t, updating, d_t, &WJKL_t, X_t); + PreconditionDirectionsInternal(rho_t, initial_product, + updating, d_t, &WJKL_t, X_t); if (scale) { if (initial_product <= 0.0) { @@ -202,123 +197,6 @@ void OnlineNaturalGradient::PreconditionDirections( t_ += 1; } -void OnlineNaturalGradient::PreconditionDirectionsDiagonal( - const BaseFloat rho_t, - bool updating, - const Vector &d_t, - CuMatrixBase *WJKL_t, - CuMatrixBase *X_t) { - KALDI_ASSERT(diagonal_power_ > 0.0 && diagonal_power_ <= 1.0 && - (diagonal_mean_.Dim() != 0 || updating)); - - int32 dim = X_t->NumCols(); - - if (diagonal_mean_.Dim() == 0) { - InitDiagonalParams(*X_t); - updating = false; - } - - CuVector new_diagonal_mean, new_diagonal_uvar; - - if (updating) { - new_diagonal_mean.Resize(dim, kUndefined); - new_diagonal_uvar.Resize(dim, kUndefined); - UpdateDiagonalStats(*X_t, &new_diagonal_mean, &new_diagonal_uvar); - } - - X_t->MulColsVec(diagonal_scale_); - - PreconditionDirectionsInternal(rho_t, TraceMatMat(*X_t, *X_t, kTrans), false, - d_t, WJKL_t, X_t); - - // We apply the scale 
both before and after the identity-plus-low-rank matrix, - // so that the combined matrix is symmetric. - X_t->MulColsVec(diagonal_scale_); - - - // If we're updating the diagonal mean and variance we do so *after* - // preconditioning the data. This is out of a concern about the provability - // of convergence (making it independent of the current minibatch). Most - // likely, in practice it would work fine updating it before, it might even be - // a little bit more stable. Anyway, this is how we're doing it, and it's how - // we did it for the core part of the natural gradient. - if (updating) { - diagonal_mean_.Swap(&new_diagonal_mean); - diagonal_uvar_.Swap(&new_diagonal_uvar); - UpdateDiagonalScale(); - } -} - -void OnlineNaturalGradient::UpdateDiagonalStats( - const CuMatrixBase &X, - CuVectorBase *diagonal_mean_new, - CuVectorBase *diagonal_uvar_new){ - int32 dim = X.NumCols(), num_rows = X.NumRows(); - KALDI_ASSERT(diagonal_mean_new->Dim() == dim && diagonal_uvar_new->Dim() == dim && - diagonal_mean_.Dim() == dim); - BaseFloat eta = Eta(X.NumRows()); - // 'eta' is a value that reflects how fast we update these stats, which is - // smaller if we're updating them slower, but strictly less than 1. It's - // basically the scale on the new stats, with 1-eta being the scale on the old - // stats. 
- KALDI_ASSERT(eta > 0 && eta < 1.0); - - diagonal_mean_new->CopyFromVec(diagonal_mean_); - diagonal_uvar_new->CopyFromVec(diagonal_uvar_); - - diagonal_mean_new->AddRowSumMat(eta / num_rows, X, 1.0 - eta); - diagonal_uvar_new->AddDiagMat2(eta / num_rows, X, kTrans, 1.0 - eta); -} - -void OnlineNaturalGradient::InitDiagonalParams( - const CuMatrixBase &X) { - int32 dim = X.NumCols(), num_rows = X.NumRows(); - diagonal_mean_.Resize(dim); - diagonal_uvar_.Resize(dim); - diagonal_mean_.AddRowSumMat(1.0 / num_rows, X, 0.0); - diagonal_uvar_.AddDiagMat2(1.0 / num_rows, X, kTrans, 0.0); - UpdateDiagonalScale(); -} - - -void OnlineNaturalGradient::UpdateDiagonalScale() { - KALDI_ASSERT(diagonal_mean_.Dim() != 0); - int32 dim = diagonal_mean_.Dim(); - if (diagonal_scale_.Dim() != dim) - diagonal_scale_.Resize(dim); - diagonal_scale_.CopyFromVec(diagonal_uvar_); - // Because the last element may be the offset, and it doesn't - // make sense to subtract its mean for this purpose, only do this for - // the all-but-last-elements. - if (dim > 1) { - CuSubVector diagonal_scale_part(diagonal_scale_, 0, dim - 1), - diagonal_mean_part(diagonal_mean_, 0, dim - 1); - diagonal_scale_part.AddVecVec(-1.0, diagonal_mean_part, - diagonal_mean_part, 1.0); - } - - // At this point, diagonal_scale_ is the diagonal of the (centered) variance - // estimated from the x and x2 statistics, prior to any flooring or - // scaling. - BaseFloat avg_variance = diagonal_scale_.Sum() / dim; - if (avg_variance <= 1.0e-20) { - // either the data is all zero or very tiny, or something went wrong. Just - // set diagonal_scale_ to a constant. - diagonal_scale_.Set(1.0); - } else { - BaseFloat floor = diagonal_epsilon_ * avg_variance; - diagonal_scale_.ApplyFloor(floor); - // The following statement scales diagonal_scale_ so its average is close to - // 1, which helps keep things in a reasonable numeric range. 
There is no - // reason why it has to be exactly one, and the whole thing is mathematically - // invariant to this scaling factor-- we output the scaling factor 'scale' - // from PreconditionDirections() so that the user can rescale so the vector - // 2-norm of the X_t matrix is the same as was before the natural gradent. - diagonal_scale_.Scale(1.0 / avg_variance); - diagonal_scale_.ApplyPow(-0.5 * diagonal_power_); - } -} - void OnlineNaturalGradient::ReorthogonalizeXt1( const VectorBase &d_t1, BaseFloat rho_t1, @@ -701,12 +579,7 @@ OnlineNaturalGradient::OnlineNaturalGradient(const OnlineNaturalGradient &other) alpha_(other.alpha_), epsilon_(other.epsilon_), delta_(other.delta_), frozen_(other.frozen_), t_(other.t_), self_debug_(other.self_debug_), W_t_(other.W_t_), - rho_t_(other.rho_t_), d_t_(other.d_t_), - diagonal_power_(other.diagonal_power_), - diagonal_epsilon_(other.diagonal_epsilon_), - diagonal_mean_(other.diagonal_mean_), - diagonal_uvar_(other.diagonal_uvar_), - diagonal_scale_(other.diagonal_scale_) { } + rho_t_(other.rho_t_), d_t_(other.d_t_) { } OnlineNaturalGradient& OnlineNaturalGradient::operator = ( diff --git a/src/nnet3/natural-gradient-online.h b/src/nnet3/natural-gradient-online.h index f2713063492..0b05948977e 100644 --- a/src/nnet3/natural-gradient-online.h +++ b/src/nnet3/natural-gradient-online.h @@ -411,43 +411,6 @@ namespace nnet3 { is that this isn't going to be a problem. */ -/** - DIAGONAL_EXTENSION - - This comment explains the diagonal extension to the natural gradient method (this - was not described in the original paper). - - Physically this diagonal scaling happens both before and after the main natural - gradient code. I.e. the main natural gradient code (which makes use - of a scaled-unit-plus-low-rank factorization), happens inside the - space where we've applied the diagonal component of the preconditioning, - so the overall natural-gradient matrix is of the form: - diag scaled-unit-plus-low-rank diag. 
- The way this is estimated only really makes sense if diagonal_power_ - is either zero or one, but I expect that for in-between values it will - work fine in practice. - - The way the diagonal scaling factor is estimated is that we accumulate mean - and variance stats for each dimension (decaying over time like the previous - natural gradient stats), and set the scaling factor to some power of the - variance estimated this way. The power of the variance used to get the - scaling factor is actually -0.5 times diagonal_power_, the factor of 0.5 - being required because the scaling is applied twice, both before and after - the scaled-unit-plus-low-rank inverse-Fisher matrix, to preserve symmetry. - - It may seem odd that we are taking into account the mean here, while - conceptually it's the uncentered covariance of the vectors that we're - modeling. The reason is that any offset in the vectors we're modeling - can be taken into account by one of the eigenvectors of the low-rank - matrix, so we anticipate that taking the mean out of consideration will - tend to give us a better factorization. This is all a litte bit ad-hoc. - It would be cleaner to formulate this whole thing as learning a factored - representation of the inverse Fisher matrix, but that would become - very complicated, so we just estimate the diagonal in this rather ad-hoc - way and then do the low-rank factorization of the Fisher matrix after - the diagonal preconditioning. - */ - class OnlineNaturalGradient { public: @@ -471,11 +434,6 @@ class OnlineNaturalGradient { int32 GetRank() const { return rank_; } int32 GetUpdatePeriod() const { return update_period_; } - // search above for DIAGONAL_EXTENSION for explanations. Value should - // be between 0 and 1. - void SetDiagonalPower(BaseFloat p) { diagonal_power_ = p; } - BaseFloat GetDiagonalPower() const { return diagonal_power_; } - // see comment where 'frozen_' is declared. 
inline void Freeze(bool frozen) { frozen_ = frozen; } @@ -521,16 +479,6 @@ class OnlineNaturalGradient { CuMatrixBase *WJKL_t, CuMatrixBase *X_t); - // This function is called from PreconditionDirections(), only if - // diagonal_power_ != 0.0 (see comment starting DIAGONAL_EXTENSION above). - // It takes care of the diagonal factors in the Fisher-matrix estimate - // and recurses to PreconditionDirectionsInternal(). - void PreconditionDirectionsDiagonal(const BaseFloat rho_t, - bool updating, - const Vector &d_t, - CuMatrixBase *WJKL_t, - CuMatrixBase *X_t); - // Works out from t_ and various class variables whether we will update // the parameters on this iteration (returns true if so). @@ -597,29 +545,6 @@ class OnlineNaturalGradient { // properties. void SelfTest() const; - - // This function, called only if diagonal_power_ != 0.0 (see - // DIAGONAL_EXTENSION comment), initializes diagonal_mean_, diagonal_uvar_ and - // diagonal_scale_, with stats from this minibatch (X is the vectors before - // preconditioning, one vector per row). - void InitDiagonalParams(const CuMatrixBase &X); - - // This function, called only if diagonal_power_ != 0.0 (see - // DIAGONAL_EXTENSION comment), sets diagonal_mean_new and diagonal_uvar_new to - // updated versions of the diagonal stats in diagonal_mean_ and diagonal_uvar_: - // changed by scaling down the old stats and then adding in stats from 'X'. - // 'X' is the vectors (one per row) that are doing to multiplied by our - // natural gradient matrix. The provided pointers will be pointers to - // temporaries that will later be copied to class members. - void UpdateDiagonalStats(const CuMatrixBase &X, - CuVectorBase *diagonal_mean_new, - CuVectorBase *diagonal_uvar_new); - - // This function updates diagonal_scale_ from the stats in - // diagonal_mean_ and diagonal_uvar_. - void UpdateDiagonalScale(); - - // Configuration values: // The rank of the correction to the unit matrix (e.g. 20). 
@@ -685,43 +610,6 @@ class OnlineNaturalGradient { CuMatrix W_t_; BaseFloat rho_t_; Vector d_t_; - - // Things below this point relate to 'diagonal' preconditioning. - // Search above for DIAGONAL_EXTENSION for an in-depth explanation. - - // The diagonal extension is turned off by default (diagonal_power_ == 0.0), - // but you can turn it on by setting diagonal_power_ (probably to some - // positive value not greater than 1, with 1 corresponding to natural - // gradient, and 0.5 corresponding to something more like Adagrad). - BaseFloat diagonal_power_; - - // diagonal_epsilon_ (e.g. 0.001) is a floor on the diagonal elements of the - // variances; this is expressed relative to the average un-floored variance - // over all dimensions (since dynamic ranges differ considerably). - BaseFloat diagonal_epsilon_; - - // dim_ is not a real variable but it is useful for explaining some things - // we're doing below. It's the dimension of the vectors we're preconditioning: - // D in the math and the paper. Is is the same as W_t_.NumCols(). - // int32 dim_; - - // diagonal_mean_, of dimension dim_ (or zero if diagonal_power_ == 0.0), is a - // moving-average mean of the vectors we're preconditioning. - CuVector diagonal_mean_; - - // diagonal_xuvar_, of dimension dim_ (or zero if diagonal_power_ == 0.0), is a - // decaying average over minibatches of the (diagonal) uncentered variance of - // the input vectors we're preconditioning. - CuVector diagonal_uvar_; - - // diagonal_scale_, of dimension dim_ (or zero if diagonal_power_ == 0.0), is a - // vector of scaling factors which is the diagonal part of the inverse-Fisher - // matrix, applied before and after the scaled-unit-plus-low-rank part. - // It is the (floored and rescaled) variance estimated from the stats in - // diagonal_mean_ and diagonal_uvar_, taken to the power -0.5 * diagonal_power_. 
- CuVector diagonal_scale_; - - }; } // namespace nnet3 diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 7d84c2e9518..34d24a39f24 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -2657,14 +2657,6 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &rank_in); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &rank_out); - if (PeekToken(is, binary) == 'D') { - ExpectToken(is, binary, ""); - BaseFloat d_in, d_out; - ReadBasicType(is, binary, &d_in); - ReadBasicType(is, binary, &d_out); - preconditioner_in_.SetDiagonalPower(d_in); - preconditioner_out_.SetDiagonalPower(d_out); - } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &update_period); ExpectToken(is, binary, ""); @@ -2773,9 +2765,7 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { // Set natural-gradient configs. BaseFloat num_samples_history = 2000.0, - alpha = 4.0, - diagonal_power_in = 0.0, - diagonal_power_out = 0.0; + alpha = 4.0; int32 rank_in = 20, rank_out = 80, update_period = 4; cfl->GetValue("num-samples-history", &num_samples_history); @@ -2783,8 +2773,6 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { cfl->GetValue("rank-in", &rank_in); cfl->GetValue("rank-out", &rank_out); cfl->GetValue("update-period", &update_period); - cfl->GetValue("diagonal-power-in", &diagonal_power_in); - cfl->GetValue("diagonal-power-out", &diagonal_power_out); preconditioner_in_.SetNumSamplesHistory(num_samples_history); preconditioner_out_.SetNumSamplesHistory(num_samples_history); @@ -2793,8 +2781,6 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { preconditioner_in_.SetRank(rank_in); preconditioner_out_.SetRank(rank_out); preconditioner_out_.SetUpdatePeriod(update_period); - preconditioner_in_.SetDiagonalPower(diagonal_power_in); - preconditioner_out_.SetDiagonalPower(diagonal_power_out); if 
(cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " @@ -2814,13 +2800,6 @@ void NaturalGradientAffineComponent::Write(std::ostream &os, WriteBasicType(os, binary, preconditioner_in_.GetRank()); WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_out_.GetRank()); - BaseFloat d_in = preconditioner_in_.GetDiagonalPower(), - d_out = preconditioner_out_.GetDiagonalPower(); - if (d_in != 0.0 || d_out != 0.0) { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, d_in); - WriteBasicType(os, binary, d_out); - } WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_in_.GetUpdatePeriod()); WriteToken(os, binary, ""); @@ -2838,12 +2817,6 @@ std::string NaturalGradientAffineComponent::Info() const { << ", num-samples-history=" << preconditioner_in_.GetNumSamplesHistory() << ", update-period=" << preconditioner_in_.GetUpdatePeriod() << ", alpha=" << preconditioner_in_.GetAlpha(); - BaseFloat d_in = preconditioner_in_.GetDiagonalPower(), - d_out = preconditioner_out_.GetDiagonalPower(); - if (d_in != 0.0 || d_out != 0.0) { - stream << ", diagonal-power-in=" << d_in - << ", diagonal-power-out=" << d_out; - } return stream.str(); } @@ -2948,14 +2921,6 @@ void LinearComponent::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); ReadBasicType(is, binary, &rank_in); ReadBasicType(is, binary, &rank_out); - if (PeekToken(is, binary) == 'D') { - ExpectToken(is, binary, ""); - BaseFloat d_in, d_out; - ReadBasicType(is, binary, &d_in); - ReadBasicType(is, binary, &d_out); - preconditioner_in_.SetDiagonalPower(d_in); - preconditioner_out_.SetDiagonalPower(d_out); - } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &alpha); ExpectToken(is, binary, ""); @@ -3007,9 +2972,7 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { // Read various natural-gradient-related configs. 
int32 rank_in = 20, rank_out = 80, update_period = 4; BaseFloat alpha = 4.0, - num_samples_history = 2000.0, - diagonal_power_in = 0.0, - diagonal_power_out = 0.0; + num_samples_history = 2000.0; use_natural_gradient_ = true; @@ -3019,9 +2982,6 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { cfl->GetValue("rank-out", &rank_out); cfl->GetValue("update-period", &update_period); cfl->GetValue("use-natural-gradient", &use_natural_gradient_); - cfl->GetValue("diagonal-power-in", &diagonal_power_in); - cfl->GetValue("diagonal-power-out", &diagonal_power_out); - preconditioner_in_.SetAlpha(alpha); preconditioner_out_.SetAlpha(alpha); @@ -3031,8 +2991,6 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { preconditioner_out_.SetNumSamplesHistory(num_samples_history); preconditioner_in_.SetUpdatePeriod(update_period); preconditioner_out_.SetUpdatePeriod(update_period); - preconditioner_in_.SetDiagonalPower(diagonal_power_in); - preconditioner_out_.SetDiagonalPower(diagonal_power_out); orthonormal_constraint_ = 0.0; cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); @@ -3059,17 +3017,10 @@ void LinearComponent::Write(std::ostream &os, rank_out = preconditioner_out_.GetRank(), update_period = preconditioner_in_.GetUpdatePeriod(); BaseFloat alpha = preconditioner_in_.GetAlpha(), - num_samples_history = preconditioner_in_.GetNumSamplesHistory(), - d_in = preconditioner_in_.GetDiagonalPower(), - d_out = preconditioner_out_.GetDiagonalPower(); + num_samples_history = preconditioner_in_.GetNumSamplesHistory(); WriteToken(os, binary, ""); WriteBasicType(os, binary, rank_in); WriteBasicType(os, binary, rank_out); - if (d_in != 0.0 || d_out != 0.0) { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, d_in); - WriteBasicType(os, binary, d_out); - } WriteToken(os, binary, ""); WriteBasicType(os, binary, alpha); WriteToken(os, binary, ""); @@ -3089,12 +3040,6 @@ std::string LinearComponent::Info() const { GetVerboseLevel() >= 2); // 
include_singular_values if (orthonormal_constraint_ != 0.0) stream << ", orthonormal-constraint=" << orthonormal_constraint_; - BaseFloat d_in = preconditioner_in_.GetDiagonalPower(), - d_out = preconditioner_out_.GetDiagonalPower(); - if (d_in != 0.0 || d_out != 0.0) { - stream << ", diagonal-power-in=" << d_in - << ", diagonal-power-out=" << d_out; - } stream << ", use-natural-gradient=" << (use_natural_gradient_ ? "true" : "false") << ", rank-in=" << preconditioner_in_.GetRank() diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 2432c912e75..f596ec6be75 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -773,13 +773,6 @@ class LogSoftmaxComponent: public NonlinearComponent { minibatches) we update the Fisher-matrix estimates; making this > 1 saves a little time in training. default=4. - diagonal-power-in, diagonal-power-out - Control a diagonal factor in the natural gradient - factorization, for the input and output spaces - respectively 0.0 = default (old-style natural - gradient), 1.0 = natural gradient with the diagonal - factors; 0.5 is more like a factorized type of - adagrad. */ class NaturalGradientAffineComponent: public AffineComponent { public: @@ -805,17 +798,6 @@ class NaturalGradientAffineComponent: public AffineComponent { NaturalGradientAffineComponent &operator= ( const NaturalGradientAffineComponent&); - // Configs for preconditioner. The input side tends to be better conditioned -> - // smaller rank needed, so make them separately configurable. - int32 rank_in_; - int32 rank_out_; - int32 update_period_; - BaseFloat num_samples_history_; - BaseFloat alpha_; - // note: the config values diagonal-power-in and diagonal-power-out - // are stored in the objects preconditioner_in_ and preconditioner_out_ - // directly. 
- OnlineNaturalGradient preconditioner_in_; OnlineNaturalGradient preconditioner_out_; @@ -883,13 +865,6 @@ class NaturalGradientAffineComponent: public AffineComponent { minibatches) we update the Fisher-matrix estimates; making this > 1 saves a little time in training. default=4. - diagonal-power-in, diagonal-power-out - Control a diagonal factor in the natural gradient - factorization, for the input and output spaces - respectively 0.0 = default (old-style natural - gradient), 1.0 = natural gradient with the diagonal - factors; 0.5 is more like a factorized type of - adagrad. */ class LinearComponent: public UpdatableComponent { public: From 49689a6fbfc20053e861056191b5b44465250bc7 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 30 Dec 2017 21:13:25 -0800 Subject: [PATCH 041/184] [scripts] Clarify documentation; remove unused feature. --- .../libs/nnet3/train/chain_objf/acoustic_model.py | 9 +-------- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 15 +++++++++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 798f6087a51..1ec9a09b571 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -173,19 +173,13 @@ def train_new_models(dir, iter, srand, num_jobs, (" --write-cache={0}/cache.{1}".format(dir, iter + 1) if job == 1 else "")) - # For the first epoch (at most the first 15 iters), scale the batchnorm stats - # down more aggressively. This affects memory-norm components. 
- batchnorm_opt=("--batchnorm-stats-scale=0.5" - if num_archives_processed < (num_archives * frame_subsampling_factor) and iter < 15 - else "") - thread = common_lib.background_command( """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ nnet3-chain-train {parallel_train_opts} {verbose_opt} \ --apply-deriv-weights={app_deriv_wts} \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ {cache_io_opts} --xent-regularize={xent_reg} \ - {deriv_time_opts} {batchnorm_opt} \ + {deriv_time_opts} \ --print-interval=10 --momentum={momentum} \ --max-param-change={max_param_change} \ --backstitch-training-scale={backstitch_training_scale} \ @@ -205,7 +199,6 @@ def train_new_models(dir, iter, srand, num_jobs, dir=dir, iter=iter, srand=iter + srand, next_iter=iter + 1, job=job, deriv_time_opts=" ".join(deriv_time_opts), - batchnorm_opt=batchnorm_opt, app_deriv_wts=apply_deriv_weights, fr_shft=frame_shift, l2=l2_regularize, xent_reg=xent_regularize, leaky=leaky_hmm_coefficient, diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index d226a891113..e66d38a3dc4 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -778,10 +778,17 @@ def _generate_lstm_config(self): # This class is for lines like # 'lstmb-layer name=lstm1 input=[-1] delay=-3' # -# TODO: more description -# It's like fast-lstm-layer but with a bottleneck (like an SVD) in the main parameter matrix -# of the LSTM (W_all, which combines all the full-rank projections of the LSTM): we divide -# it into two matrices, with an orbatch-norm in between to stabilize the training. +# LSTMB is not something we've published; it's LSTM with a bottleneck in the +# middle of the W_all matrix (where W_all is a matrix that combines the 8 full +# matrices of standard LSTM). W_all is factored into W_all_a and W_all_b, where +# W_all_a is constrained to have orthonormal rows (this keeps it training stably). 
+# +# It also contains a couple of other improvements: W_all_b is followed by +# trainable ScaleAndOffsetComponent (this is a bit like the idea from the +# publication "Self-stabilized deep neural network" by Ghahramani et al). +# And the LSTM is followed by a batchnorm component (this is by default; it's not +# part of the layer name, like lstmb-batchnorm-layer). + # # The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, # the dimension defaults to the same as the input. From c362b2cf4bfbe51282645a472f04a76e8ee475f5 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 31 Dec 2017 19:35:15 -0800 Subject: [PATCH 042/184] [src] Add more diagnostics for NonlinearComponent --- src/nnet3/nnet-component-itf.cc | 36 ++++++++++++++++++++++++++++----- src/nnet3/nnet-component-itf.h | 5 +++++ src/nnet3/nnet-parse.cc | 5 +++++ src/nnet3/nnet-parse.h | 2 ++ 4 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 8a52b7b788c..d0319403b10 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -393,11 +393,16 @@ std::string NonlinearComponent::Info() const { value_avg.Scale(1.0 / count_); stream << ", value-avg=" << SummarizeVector(value_avg); if (deriv_sum_.Dim() == dim_) { - Vector deriv_avg_dbl(deriv_sum_); - Vector deriv_avg(deriv_avg_dbl); + Vector deriv_avg(deriv_sum_); deriv_avg.Scale(1.0 / count_); stream << ", deriv-avg=" << SummarizeVector(deriv_avg); } + if (oderiv_sumsq_.Dim() == dim_) { + Vector oderiv_rms(oderiv_sumsq_); + oderiv_rms.Scale(1.0 / count_); + oderiv_rms.ApplyPow(0.5); + stream << ", oderiv-rms=" << SummarizeVector(oderiv_rms); + } } return stream.str(); } @@ -405,6 +410,7 @@ std::string NonlinearComponent::Info() const { void NonlinearComponent::Scale(BaseFloat scale) { value_sum_.Scale(scale); deriv_sum_.Scale(scale); + oderiv_sumsq_.Scale(scale); count_ *= scale; num_dims_self_repaired_ *= scale; 
num_dims_processed_ *= scale; @@ -418,10 +424,14 @@ void NonlinearComponent::Add(BaseFloat alpha, const Component &other_in) { value_sum_.Resize(other->value_sum_.Dim()); if (deriv_sum_.Dim() == 0 && other->deriv_sum_.Dim() != 0) deriv_sum_.Resize(other->deriv_sum_.Dim()); + if (oderiv_sumsq_.Dim() == 0 && other->oderiv_sumsq_.Dim() != 0) + oderiv_sumsq_.Resize(other->oderiv_sumsq_.Dim()); if (other->value_sum_.Dim() != 0) value_sum_.AddVec(alpha, other->value_sum_); if (other->deriv_sum_.Dim() != 0) deriv_sum_.AddVec(alpha, other->deriv_sum_); + if (other->oderiv_sumsq_.Dim() != 0) + oderiv_sumsq_.AddVec(alpha, other->oderiv_sumsq_); count_ += alpha * other->count_; num_dims_self_repaired_ += alpha * other->num_dims_self_repaired_; num_dims_processed_ += alpha * other->num_dims_processed_; @@ -443,10 +453,18 @@ void NonlinearComponent::Read(std::istream &is, bool binary) { value_sum_.Read(is, binary); ExpectToken(is, binary, ""); deriv_sum_.Read(is, binary); + if (PeekToken(is, binary) == 'O') { + ExpectToken(is, binary, ""); + oderiv_sumsq_.Read(is, binary); + oderiv_sumsq_.ApplyPow(2.0); + } else { + oderiv_sumsq_.Resize(deriv_sum_.Dim()); + } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &count_); value_sum_.Scale(count_); deriv_sum_.Scale(count_); + oderiv_sumsq_.Scale(count_); std::string token; ReadToken(is, binary, &token); @@ -493,12 +511,20 @@ void NonlinearComponent::Write(std::ostream &os, bool binary) const { Vector temp(value_sum_); if (count_ != 0.0) temp.Scale(1.0 / count_); temp.Write(os, binary); - WriteToken(os, binary, ""); - temp.Resize(deriv_sum_.Dim(), kUndefined); + WriteToken(os, binary, ""); + temp.Resize(deriv_sum_.Dim()); temp.CopyFromVec(deriv_sum_); if (count_ != 0.0) temp.Scale(1.0 / count_); temp.Write(os, binary); + + WriteToken(os, binary, ""); + temp.Resize(oderiv_sumsq_.Dim()); + temp.CopyFromVec(oderiv_sumsq_); + if (count_ != 0.0) temp.Scale(1.0 / count_); + temp.ApplyPow(0.5); + temp.Write(os, binary); + 
WriteToken(os, binary, ""); WriteBasicType(os, binary, count_); WriteToken(os, binary, ""); @@ -530,7 +556,7 @@ NonlinearComponent::NonlinearComponent(): NonlinearComponent::NonlinearComponent(const NonlinearComponent &other): dim_(other.dim_), block_dim_(other.block_dim_), value_sum_(other.value_sum_), deriv_sum_(other.deriv_sum_), - count_(other.count_), + oderiv_sumsq_(other.oderiv_sumsq_), count_(other.count_), num_dims_self_repaired_(other.num_dims_self_repaired_), num_dims_processed_(other.num_dims_processed_), self_repair_lower_threshold_(other.self_repair_lower_threshold_), diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index 565a7f25e74..c096a78325b 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -658,6 +658,11 @@ class NonlinearComponent: public Component { CuVector deriv_sum_; // stats of the derivative of the nonlinearity // (only applicable to element-by-element // nonlinearities, not Softmax. + CuVector oderiv_sumsq_; // Sum-square of the derivative of the + // objective function, that we're propagating + // back. Accumulated during the backprop; + // used for diagnostics. + double count_; // some stats for self-repairing nonlinearities. 
diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc index 52d47876c8a..bb3a209460a 100644 --- a/src/nnet3/nnet-parse.cc +++ b/src/nnet3/nnet-parse.cc @@ -517,6 +517,11 @@ std::string SummarizeVector(const VectorBase &vec) { return os.str(); } +std::string SummarizeVector(const VectorBase &vec) { + Vector vec_copy(vec); + return SummarizeVector(vec_copy); +} + std::string SummarizeVector(const CuVectorBase &cu_vec) { Vector vec(cu_vec); return SummarizeVector(vec); diff --git a/src/nnet3/nnet-parse.h b/src/nnet3/nnet-parse.h index 5cfe080e422..0b2e0041aaa 100644 --- a/src/nnet3/nnet-parse.h +++ b/src/nnet3/nnet-parse.h @@ -196,6 +196,8 @@ std::string ErrorContext(const std::string &str); */ std::string SummarizeVector(const VectorBase &vec); +std::string SummarizeVector(const VectorBase &vec); + std::string SummarizeVector(const CuVectorBase &vec); /** Print to 'os' some information about the mean and standard deviation of From b5ad6ec6ef1f7feded82115c922431e9dee0355b Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 1 Jan 2018 01:55:02 -0500 Subject: [PATCH 043/184] [src] nnet3 bug fixes RE oderiv-rms stats --- .../s5/steps/libs/nnet3/report/log_parse.py | 4 +- .../nnet3/train/chain_objf/acoustic_model.py | 4 +- egs/wsj/s5/steps/libs/nnet3/train/common.py | 2 +- .../nnet3/train/frame_level_objf/common.py | 10 ++- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 56 ++++++++----- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 3 +- .../libs/nnet3/xconfig/trivial_layers.py | 53 ++++++++++++ egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py | 50 ++++++++++- egs/wsj/s5/steps/nnet3/chain/train.py | 5 +- src/cudamatrix/cu-allocator.h | 2 +- src/nnet3/nnet-component-itf.cc | 83 ++++++++++++------- src/nnet3/nnet-component-itf.h | 16 ++-- src/nnet3/nnet-component-test.cc | 6 +- 13 files changed, 215 insertions(+), 79 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index 
d5f2575d582..905edc1a78b 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -388,8 +388,8 @@ def parse_prob_logs(exp_dir, key='accuracy', output="output"): " key {k} in both {tl} and {vl}".format( k=key, tl=train_prob_files, vl=valid_prob_files)) iters.sort() - return map(lambda x: (int(x), float(train_objf[x]), - float(valid_objf[x])), iters) + return list(map(lambda x: (int(x), float(train_objf[x]), + float(valid_objf[x])), iters)) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 1ec9a09b571..c63901367d6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -387,8 +387,8 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats - lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), - range(1, num_lda_jobs + 1)) + lda_stat_files = list(map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1))) common_lib.execute_command( """{command} {dir}/log/sum_transform_stats.log \ diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 2b4fdd92cec..6c194a2c0a1 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -531,7 +531,7 @@ def smooth_presoftmax_prior_scale_vector(pdf_counts, scales.append(math.pow(pdf_counts[i] + smooth * average_count, presoftmax_prior_scale_power)) num_pdfs = len(pdf_counts) - scaled_counts = map(lambda x: x * float(num_pdfs) / sum(scales), scales) + scaled_counts = list(map(lambda x: x * float(num_pdfs) / sum(scales), scales)) return scaled_counts diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py 
b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index aa100e6af91..9f9b5752ce6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -9,6 +9,8 @@ network without transition model) with frame-level objectives. """ +from __future__ import print_statement +from __future__ import division import glob import logging import math @@ -91,7 +93,7 @@ def train_new_models(dir, iter, srand, num_jobs, archive_index = (k % num_archives) + 1 if not chunk_level_training: - frame = (k / num_archives + archive_index) % frames_per_eg + frame = (k // num_archives + archive_index) % frames_per_eg cache_io_opts = (("--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) @@ -344,8 +346,8 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats - lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), - range(1, num_lda_jobs + 1)) + lda_stat_files = list(map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1))) common_lib.execute_command( """{command} {dir}/log/sum_transform_stats.log \ @@ -576,7 +578,7 @@ def get_realign_iters(realign_times, num_iters, + realign_time * math.pow(num_jobs_final, 2)) realign_iter = realign_iter - num_jobs_initial - realign_iter = realign_iter / (num_jobs_final - num_jobs_initial) + realign_iter = realign_iter // (num_jobs_final - num_jobs_initial) realign_iter = realign_iter * num_iters realign_iters.append(int(realign_iter)) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index e66d38a3dc4..85454795435 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -609,9 +609,10 @@ def set_default_configs(self): 'clipping-threshold' : 30.0, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, - # 
recurrence-scale is a scale we put on the c_t when doing linear projections - # from it... making it larger than 1 (e.g. 4) helps equalize scales. - 'recurrence-scale': 1.0, + # self-scale is a scale we put on the m_t when doing + # linear projections from it... making it larger than 1 + # (e.g. 4) helps equalize scales. + 'self-scale': 1.0, 'delay' : -1, # if you want to set 'self-repair-scale' (c.f. the # self-repair-scale-nonlinearity config value in older LSTM layers), you can @@ -748,7 +749,7 @@ def _generate_lstm_config(self): configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " "IfDefined(Offset(Scale({2}, {0}.c_trunc), {3})))".format( - name, input_descriptor, self.config['recurrence-scale'], delay)) + name, input_descriptor, self.config['self-scale'], delay)) if self.config['self-stabilize']: configs.append("component-node name={0}.W_all_so component={0}.W_all_so input={0}.W_all".format(name)) W_all_name = 'W_all_so' @@ -825,18 +826,23 @@ def set_default_configs(self): self.config = { 'input':'[-1]', 'cell-dim' : -1, # this is a required argument 'bottleneck-dim': -1, # this is a required argument - 'clipping-threshold' : 30.0, - 'zeroing-interval' : 20, - 'zeroing-threshold' : 15.0, + 'clipping-threshold': 30.0, + 'zeroing-interval': 20, + 'zeroing-threshold': 15.0, + # batchnorm-power is for what i'm going to call OverNorm, you can set it + # for example to -0.75. + 'batchnorm-power': -0.5, 'delay' : -1, 'lstm-nonlinearity-options' : ' max-change=0.75', + # the recurrence scale is the scale on m_trunc, used in the + # recurrence (to balance its size with the input). + 'self-scale' : 1.0, # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. 'ng-affine-options' : ' max-change=1.5', 'l2-regularize': 0.0, 'decay-time': -1.0 } - self.c_needed = False # keep track of whether the 'c' output is needed. 
def set_derived_configs(self): if self.config['cell-dim'] <= 0: @@ -884,6 +890,7 @@ def _generate_lstm_config(self): input_descriptor = self.descriptors['input']['final-string'] cell_dim = self.config['cell-dim'] bottleneck_dim = self.config['bottleneck-dim'] + self_scale = self.config['self-scale'] delay = self.config['delay'] affine_str = self.config['ng-affine-options'] l2_regularize = self.config['l2-regularize'] @@ -917,7 +924,7 @@ def _generate_lstm_config(self): # constraint, it's meaningless. configs.append("### Begin LTSM layer '{0}'".format(name)) configs.append("component name={0}.W_all_a type=LinearComponent input-dim={1} " - "orthonormal-constraint=1.0 output-dim={2} {3} ".format( + "orthonormal-constraint=1.0 output-dim={2} {3}".format( name, input_dim + cell_dim, bottleneck_dim, affine_str)) @@ -936,16 +943,15 @@ def _generate_lstm_config(self): l2_regularize_option)) configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") - configs.append("component name={0}.c_trunc type=BackpropTruncationComponent dim={1} {2}".format( - name, cell_dim, bptrunc_str)) - configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( - name, cell_dim)) - + configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format( + name, 2 * cell_dim, bptrunc_str)) + configs.append("component name={0}.m_batchnorm type=BatchNormComponent power={1} dim={2} ".format( + name, self.config['batchnorm-power'], cell_dim)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " - "IfDefined(Offset({0}.c_trunc, {2})))".format( - name, input_descriptor, delay)) + "IfDefined(Offset(Scale({2}, {0}.m_trunc), {3})))".format( + name, input_descriptor, self_scale, delay)) configs.append("component-node name={0}.W_all_b component={0}.W_all_b " "input={0}.W_all_a".format(name)) 
configs.append("component-node name={0}.W_all_b_so component={0}.W_all_b_so " @@ -954,11 +960,13 @@ def _generate_lstm_config(self): configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " "input=Append({0}.W_all_b_so, IfDefined(Offset({0}.c_trunc, {1})))".format( name, delay)) - configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 " - "dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} " "dim={1}".format(name, cell_dim)) - configs.append("component-node name={0}.c_trunc component={0}.c_trunc input={0}.c".format(name)) + configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) + configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 " + "dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} " + "dim={1}".format(name, cell_dim)) configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) @@ -1151,15 +1159,17 @@ def _generate_lstm_config(self): if self.config['self-stabilize']: # have LinearComponent followed by ScaleAndOffsetComponent. 
configs.append("component name={0}.W_all type=LinearComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim * 4, - affine_str, l2_regularize_option)) + "output-dim={2} {3} {4} ".format( + name, input_dim + rec_proj_dim, cell_dim * 4, + affine_str, l2_regularize_option)) configs.append("component name={0}.W_all_so type=ScaleAndOffsetComponent dim={1} " "max-change=0.75".format(name, cell_dim * 4)) else: # have NaturalGradientAffineComponent configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim * 4, - affine_str, l2_regularize_option)) + "output-dim={2} {3} {4}".format( + name, input_dim + rec_proj_dim, cell_dim * 4, + affine_str, l2_regularize_option)) configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 71205961681..c6b0619bca8 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -68,7 +68,8 @@ 'opgru-layer' : xlayers.XconfigOpgruLayer, 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, - 'renorm-component': xlayers.XconfigRenormComponent + 'renorm-component': xlayers.XconfigRenormComponent, + 'no-op-component': xlayers.XconfigNoOpComponent } # Turn a config line and a list of previous layers into diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py index 80a2b7df418..ef05887e469 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py @@ -68,3 +68,56 @@ def 
_generate_config(self): self.name, input_desc)) configs.append(line) return configs + + +class XconfigNoOpComponent(XconfigLayerBase): + """This class is for parsing lines like + 'no-op-component name=renorm input=Append(-3,0,3)' + which will produce just a single component, of type NoOpComponent. + + Parameters of the class, and their defaults: + input='[-1]' [Descriptor giving the input of the layer.] + """ + def __init__(self, first_token, key_to_value, prev_names=None): + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input': '[-1]' } + + def check_configs(self): + pass + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + return self.name + + def output_dim(self, auxiliary_output=None): + assert auxiliary_output is None + input_dim = self.descriptors['input']['dim'] + return input_dim + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. 
+ input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + + configs = [] + line = ('component name={0} type=NoOpComponent dim={1}'.format( + self.name, input_dim)) + configs.append(line) + line = ('component-node name={0} component={0} input={1}'.format( + self.name, input_desc)) + configs.append(line) + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py index 9ff7f1e2258..08de18167cd 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py @@ -6,6 +6,7 @@ # while xconfig_layers.py contains the code specific to layer types. from __future__ import print_function +from __future__ import division import re import sys @@ -277,6 +278,12 @@ def dim(self, layer_to_dim): return self.items[0].dim(layer_to_dim) elif self.operator == 'Append': return sum([ x.dim(layer_to_dim) for x in self.items]) + elif self.operator == 'Scale': + # e.g. Scale(2.0, lstm1). Return dim of 2nd arg. + return self.items[1].dim(layer_to_dim) + elif self.operator == 'Const': + # e.g. Const(0.5, 512). Return 2nd arg, which is an int. + return self.items[1] else: raise RuntimeError("Unknown operator {0}".format(self.operator)) @@ -312,7 +319,8 @@ def parse_new_descriptor(tokens, pos, prev_names): # when reading this function, be careful to note the indent level, # there is an if-statement within an if-statement. 
- if first_token in [ 'Offset', 'Round', 'ReplaceIndex', 'Append', 'Sum', 'Switch', 'Failover', 'IfDefined' ]: + if first_token in [ 'Offset', 'Round', 'ReplaceIndex', 'Append', 'Sum', + 'Switch', 'Failover', 'IfDefined' ]: expect_token('(', tokens[pos], first_token + '()') pos += 1 d.operator = first_token @@ -392,6 +400,38 @@ def parse_new_descriptor(tokens, pos, prev_names): pos += 1 else: raise RuntimeError("code error") + elif first_token in ['Scale', 'Const' ]: + # Parsing something like 'Scale(2.0, lstm1)' or 'Const(1.0, 512)' + expect_token('(', tokens[pos], first_token + '()') + pos += 1 + d.operator = first_token + # First arg of Scale() and Const() is a float: the scale or value, + # respectively. + try: + value = float(tokens[pos]) + pos += 1 + d.items = [value] + except: + raise RuntimeError("Parsing {0}, expected float, got {1}".format( + first_token, tokens[pos])) + # Consume the comma. + expect_token(',', tokens[pos], first_token + '()') + pos += 1 + if first_token == 'Scale': + # Second arg of Scale() is a Descriptor. 
+ (desc, pos) = parse_new_descriptor(tokens, pos, prev_names) + d.items.append(desc) + else: + assert first_token == 'Const' + try: + dim = int(tokens[pos]) + pos += 1 + d.items.append(dim) + except: + raise RuntimeError("Parsing Const() expression, expected int, got {0}".format( + tokens[pos])) + expect_token(')', tokens[pos], first_token) + pos += 1 elif first_token in [ 'end of string', '(', ')', ',', '@' ]: raise RuntimeError("Expected descriptor, got " + first_token) elif is_valid_line_name(first_token) or first_token == '[': @@ -555,7 +595,7 @@ def parse_config_line(orig_config_line): rest_of_line = ' '.join(fields) # rest of the line can be of the form 'a=1 b=" x=1 y=2 " c=Append( i1, i2)' - positions = map(lambda x: x.start(), re.finditer('"', rest_of_line)) + positions = list(map(lambda x: x.start(), re.finditer('"', rest_of_line))) if not len(positions) % 2 == 0: raise RuntimeError("Double-quotes should occur in pairs") @@ -565,7 +605,7 @@ def parse_config_line(orig_config_line): # and replace the quotation marks themselves with spaces. # Then later on we'll convert all the question marks to # equals signs in the values in the dicts. 
- num_strings = len(positions) / 2 + num_strings = len(positions) // 2 fields = [] for i in range(num_strings): start = positions[i * 2] @@ -588,7 +628,7 @@ def parse_config_line(orig_config_line): if not (other_fields[0] == '' and len(other_fields) % 2 == 1): raise RuntimeError("Could not parse config line."); fields += other_fields[1:] - num_variables = len(fields) / 2 + num_variables = len(fields) // 2 for i in range(num_variables): var_name = fields[i * 2] var_value = fields[i * 2 + 1] @@ -634,6 +674,8 @@ def test_library(): ('Append(-3,0,3)', 'Append(Offset(prev_layer, -3), prev_layer, Offset(prev_layer, 3))'), ('[-1]', 'prev_layer'), + ('Scale(2.0,foo)', 'Scale(2.0, foo)'), + ('Const(0.5,500)', 'Const(0.5, 500)'), ('[-2]', 'last_but_one_layer'), ('[-2]@3', 'Offset(last_but_one_layer, 3)') ]: diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 7f607abd8dc..e52d2ecee20 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -522,7 +522,7 @@ def train(args, run_opts): backstitch_training_interval=args.backstitch_training_interval) if args.cleanup: - # do a clean up everythin but the last 2 models, under certain + # do a clean up everything but the last 2 models, under certain # conditions common_train_lib.remove_model( args.dir, iter-2, num_iters, models_to_combine, @@ -573,8 +573,9 @@ def train(args, run_opts): # delete it remove_egs = False + # leave the last-two-numbered models, for diagnostic reasons. common_train_lib.clean_nnet_dir( - args.dir, num_iters, egs_dir, + args.dir, num_iters - 1, egs_dir, preserve_model_interval=args.preserve_model_interval, remove_egs=remove_egs) diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index f2ccf0d6c29..0f96315e848 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -54,7 +54,7 @@ struct CuAllocatorOptions { // is a constant overhead proportional to the number of buckets. 
BaseFloat delete_factor; - CuAllocatorOptions(): memory_factor(1.5), + CuAllocatorOptions(): memory_factor(1.3), delete_factor(0.001) { } void Check() { diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index d0319403b10..e2a316835fd 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -332,24 +332,23 @@ std::string UpdatableComponent::Info() const { void NonlinearComponent::StoreStatsInternal( const CuMatrixBase &out_value, const CuMatrixBase *deriv) { - KALDI_ASSERT(out_value.NumCols() == InputDim()); + KALDI_ASSERT(out_value.NumCols() == dim_); // Check we have the correct dimensions. - if (value_sum_.Dim() != InputDim() || - (deriv != NULL && deriv_sum_.Dim() != InputDim())) { - std::lock_guard lock(mutex_); - if (value_sum_.Dim() != InputDim()) { - value_sum_.Resize(InputDim()); + if (value_sum_.Dim() != dim_ || + (deriv != NULL && deriv_sum_.Dim() != dim_)) { + if (value_sum_.Dim() != dim_) { + value_sum_.Resize(dim_); count_ = 0.0; } - if (deriv != NULL && deriv_sum_.Dim() != InputDim()) { - deriv_sum_.Resize(InputDim()); + if (deriv != NULL && deriv_sum_.Dim() != dim_) { + deriv_sum_.Resize(dim_); count_ = 0.0; value_sum_.SetZero(); } } count_ += out_value.NumRows(); - CuVector temp(InputDim()); + CuVector temp(dim_); temp.AddRowSumMat(1.0, out_value, 0.0); value_sum_.AddVec(1.0, temp); if (deriv != NULL) { @@ -358,22 +357,35 @@ void NonlinearComponent::StoreStatsInternal( } } +void NonlinearComponent::StoreBackpropStats( + const CuMatrixBase &out_deriv) { + KALDI_ASSERT(out_deriv.NumCols() == dim_); + + // Check we have the correct dimensions. 
+ if (oderiv_sumsq_.Dim() != dim_) { + oderiv_sumsq_.Resize(dim_); + oderiv_count_ = 0.0; + } + CuVector temp(dim_); + temp.AddDiagMat2(1.0, out_deriv, kTrans, 0.0); + oderiv_sumsq_.AddVec(1.0, temp); + oderiv_count_ += out_deriv.NumRows(); +} + + void NonlinearComponent::ZeroStats() { value_sum_.SetZero(); deriv_sum_.SetZero(); + oderiv_sumsq_.SetZero(); count_ = 0.0; + oderiv_count_ = 0.0; num_dims_self_repaired_ = 0.0; num_dims_processed_ = 0.0; } std::string NonlinearComponent::Info() const { std::stringstream stream; - if (InputDim() == OutputDim()) { - stream << Type() << ", dim=" << InputDim(); - } else { - stream << Type() << ", input-dim=" << InputDim() - << ", output-dim=" << OutputDim(); - } + stream << Type() << ", dim=" << dim_; if (block_dim_ != dim_) stream << ", block-dim=" << block_dim_; if (self_repair_lower_threshold_ != BaseFloat(kUnsetThreshold)) @@ -397,12 +409,13 @@ std::string NonlinearComponent::Info() const { deriv_avg.Scale(1.0 / count_); stream << ", deriv-avg=" << SummarizeVector(deriv_avg); } - if (oderiv_sumsq_.Dim() == dim_) { - Vector oderiv_rms(oderiv_sumsq_); - oderiv_rms.Scale(1.0 / count_); - oderiv_rms.ApplyPow(0.5); - stream << ", oderiv-rms=" << SummarizeVector(oderiv_rms); - } + } + if (oderiv_count_ > 0 && oderiv_sumsq_.Dim() == dim_) { + Vector oderiv_rms(oderiv_sumsq_); + oderiv_rms.Scale(1.0 / oderiv_count_); + oderiv_rms.ApplyPow(0.5); + stream << ", oderiv-rms=" << SummarizeVector(oderiv_rms) + << ", oderiv-count=" << oderiv_count_; } return stream.str(); } @@ -412,6 +425,7 @@ void NonlinearComponent::Scale(BaseFloat scale) { deriv_sum_.Scale(scale); oderiv_sumsq_.Scale(scale); count_ *= scale; + oderiv_count_ *= scale; num_dims_self_repaired_ *= scale; num_dims_processed_ *= scale; } @@ -433,6 +447,7 @@ void NonlinearComponent::Add(BaseFloat alpha, const Component &other_in) { if (other->oderiv_sumsq_.Dim() != 0) oderiv_sumsq_.AddVec(alpha, other->oderiv_sumsq_); count_ += alpha * other->count_; + oderiv_count_ += 
alpha * other->oderiv_count_; num_dims_self_repaired_ += alpha * other->num_dims_self_repaired_; num_dims_processed_ += alpha * other->num_dims_processed_; } @@ -453,18 +468,21 @@ void NonlinearComponent::Read(std::istream &is, bool binary) { value_sum_.Read(is, binary); ExpectToken(is, binary, ""); deriv_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); if (PeekToken(is, binary) == 'O') { ExpectToken(is, binary, ""); oderiv_sumsq_.Read(is, binary); oderiv_sumsq_.ApplyPow(2.0); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &oderiv_count_); } else { - oderiv_sumsq_.Resize(deriv_sum_.Dim()); + oderiv_count_ = 0.0; + oderiv_sumsq_.Resize(0); } - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &count_); value_sum_.Scale(count_); deriv_sum_.Scale(count_); - oderiv_sumsq_.Scale(count_); + oderiv_sumsq_.Scale(oderiv_count_); std::string token; ReadToken(is, binary, &token); @@ -518,15 +536,19 @@ void NonlinearComponent::Write(std::ostream &os, bool binary) const { if (count_ != 0.0) temp.Scale(1.0 / count_); temp.Write(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + WriteToken(os, binary, ""); temp.Resize(oderiv_sumsq_.Dim()); temp.CopyFromVec(oderiv_sumsq_); - if (count_ != 0.0) temp.Scale(1.0 / count_); + if (oderiv_count_ != 0.0) temp.Scale(1.0 / oderiv_count_); temp.ApplyPow(0.5); temp.Write(os, binary); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, count_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, oderiv_count_); + WriteToken(os, binary, ""); WriteBasicType(os, binary, num_dims_self_repaired_); WriteToken(os, binary, ""); @@ -547,7 +569,7 @@ void NonlinearComponent::Write(std::ostream &os, bool binary) const { } NonlinearComponent::NonlinearComponent(): - dim_(-1), block_dim_(-1), count_(0.0), + dim_(-1), block_dim_(-1), count_(0.0), oderiv_count_(0.0), num_dims_self_repaired_(0.0), num_dims_processed_(0.0), 
self_repair_lower_threshold_(kUnsetThreshold), self_repair_upper_threshold_(kUnsetThreshold), @@ -556,7 +578,8 @@ NonlinearComponent::NonlinearComponent(): NonlinearComponent::NonlinearComponent(const NonlinearComponent &other): dim_(other.dim_), block_dim_(other.block_dim_), value_sum_(other.value_sum_), deriv_sum_(other.deriv_sum_), - oderiv_sumsq_(other.oderiv_sumsq_), count_(other.count_), + count_(other.count_), oderiv_sumsq_(other.oderiv_sumsq_), + oderiv_count_(other.oderiv_count_), num_dims_self_repaired_(other.num_dims_self_repaired_), num_dims_processed_(other.num_dims_processed_), self_repair_lower_threshold_(other.self_repair_lower_threshold_), diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index c096a78325b..c34d550d681 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -590,7 +590,7 @@ class UpdatableComponent: public Component { block-dim Defaults to dim, but may be any nonzero divisor of dim. It affects the self-repair, which will be done while treating the input/output as - repeating blocks of size 'block-dim' (e.g. blocks of filtes). It allows + repeating blocks of size 'block-dim' (e.g. blocks of filters). It allows us to do self-repair on the filter level in CNNs. Currently this only makes a difference for RectifiedLinearComponent. */ @@ -643,6 +643,10 @@ class NonlinearComponent: public Component { void StoreStatsInternal(const CuMatrixBase &out_value, const CuMatrixBase *deriv = NULL); + // This function may be called from child class members during backprop. It + // stores the 'oderiv_sumsq_' stats. + void StoreBackpropStats(const CuMatrixBase &out_deriv); + const NonlinearComponent &operator = (const NonlinearComponent &other); // Disallow. @@ -658,12 +662,15 @@ class NonlinearComponent: public Component { CuVector deriv_sum_; // stats of the derivative of the nonlinearity // (only applicable to element-by-element // nonlinearities, not Softmax. 
+ // Count corresponding to the stats in 'value_sum_' and 'deriv_sum_' + double count_; + CuVector oderiv_sumsq_; // Sum-square of the derivative of the // objective function, that we're propagating // back. Accumulated during the backprop; // used for diagnostics. - - double count_; + // Count corresponding to the stats in 'oderiv_sumsq_'. + double oderiv_count_; // some stats for self-repairing nonlinearities. double num_dims_self_repaired_; @@ -673,9 +680,6 @@ class NonlinearComponent: public Component { BaseFloat self_repair_lower_threshold_; BaseFloat self_repair_upper_threshold_; BaseFloat self_repair_scale_; - - // The mutex is used in UpdateStats, only for resizing vectors. - std::mutex mutex_; }; } // namespace nnet3 diff --git a/src/nnet3/nnet-component-test.cc b/src/nnet3/nnet-component-test.cc index 882ef112919..d7595378c1f 100644 --- a/src/nnet3/nnet-component-test.cc +++ b/src/nnet3/nnet-component-test.cc @@ -274,7 +274,7 @@ bool TestSimpleComponentDataDerivative(const Component &c, int32 input_dim = c.InputDim(), output_dim = c.OutputDim(), - num_rows = RandInt(1, 20), + num_rows = RandInt(1, 100), rand_seed = Rand(); int32 properties = c.Properties(); CuMatrix input_data(num_rows, input_dim, kSetZero, input_stride_type), @@ -317,7 +317,7 @@ bool TestSimpleComponentDataDerivative(const Component &c, } KALDI_LOG << "Predicted objf-change = " << predicted_objf_change; KALDI_LOG << "Measured objf-change = " << measured_objf_change; - BaseFloat threshold = 0.05; + BaseFloat threshold = 0.1; bool ans = ApproxEqual(predicted_objf_change, measured_objf_change, threshold); if (!ans) KALDI_WARN << "Data-derivative test failed, component-type=" @@ -442,7 +442,7 @@ bool TestSimpleComponentModelDerivative(const Component &c, void UnitTestNnetComponent() { - for (int32 n = 0; n < 2000; n++) { + for (int32 n = 0; n < 200; n++) { Component *c = GenerateRandomSimpleComponent(); KALDI_LOG << c->Info(); TestNnetComponentIo(c); From 
9f362a0455e08a78b971856c468ec1e9e2615fc9 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 4 Jan 2018 19:09:22 -0500 Subject: [PATCH 044/184] [src] Finish code for oderiv-rms stats. --- src/nnet3/nnet-component-itf.cc | 10 ++++++++++ src/nnet3/nnet-simple-component.cc | 26 ++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index e2a316835fd..0a82a592102 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -359,6 +359,10 @@ void NonlinearComponent::StoreStatsInternal( void NonlinearComponent::StoreBackpropStats( const CuMatrixBase &out_deriv) { + // only store these stats about every 4 minibatches. + if (RandInt(0, 3) == 0) + return; + KALDI_ASSERT(out_deriv.NumCols() == dim_); // Check we have the correct dimensions. @@ -413,6 +417,9 @@ std::string NonlinearComponent::Info() const { if (oderiv_count_ > 0 && oderiv_sumsq_.Dim() == dim_) { Vector oderiv_rms(oderiv_sumsq_); oderiv_rms.Scale(1.0 / oderiv_count_); + // The ApplyMin() is so that the statement after it does not fail even if we + // had subtracted models (e.g. in full_progress.*.log). + oderiv_rms.ApplyFloor(0.0); oderiv_rms.ApplyPow(0.5); stream << ", oderiv-rms=" << SummarizeVector(oderiv_rms) << ", oderiv-count=" << oderiv_count_; @@ -543,6 +550,9 @@ void NonlinearComponent::Write(std::ostream &os, bool binary) const { temp.Resize(oderiv_sumsq_.Dim()); temp.CopyFromVec(oderiv_sumsq_); if (oderiv_count_ != 0.0) temp.Scale(1.0 / oderiv_count_); + // The ApplyMin() is so that the statement after it does not fail even if we + // had subtracted models (e.g. in full_progress.*.log). 
+ temp.ApplyFloor(0.0); temp.ApplyPow(0.5); temp.Write(os, binary); diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 34d24a39f24..5bd6ffeee32 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -331,8 +331,10 @@ void SigmoidComponent::Backprop(const std::string &debug_info, if (in_deriv != NULL) { in_deriv->DiffSigmoid(out_value, out_deriv); SigmoidComponent *to_update = dynamic_cast(to_update_in); - if (to_update != NULL) + if (to_update != NULL) { RepairGradients(out_value, in_deriv, to_update); + to_update->StoreBackpropStats(out_deriv); + } } } @@ -839,8 +841,10 @@ void TanhComponent::Backprop(const std::string &debug_info, if (in_deriv != NULL) { in_deriv->DiffTanh(out_value, out_deriv); TanhComponent *to_update = dynamic_cast(to_update_in); - if (to_update != NULL) + if (to_update != NULL) { RepairGradients(out_value, in_deriv, to_update); + to_update->StoreBackpropStats(out_deriv); + } } } @@ -889,8 +893,10 @@ void RectifiedLinearComponent::Backprop( in_deriv->MulElements(out_deriv); RectifiedLinearComponent *to_update = dynamic_cast(to_update_in); - if (to_update != NULL) + if (to_update != NULL) { RepairGradients(in_deriv, to_update); + to_update->StoreBackpropStats(out_deriv); + } } } @@ -3404,6 +3410,13 @@ void SoftmaxComponent::Backprop(const std::string &debug_info, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { + + if (to_update_in) { + SoftmaxComponent *to_update = + dynamic_cast(to_update_in); + to_update->StoreBackpropStats(out_deriv); + } + if (in_deriv == NULL) return; /* @@ -3443,8 +3456,13 @@ void LogSoftmaxComponent::Backprop(const std::string &debug_info, const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, void *memo, - Component *, // to_update + Component *to_update_in, CuMatrixBase *in_deriv) const { + if (to_update_in) { + LogSoftmaxComponent *to_update = + dynamic_cast(to_update_in); + 
to_update->StoreBackpropStats(out_deriv); + } if (in_deriv == NULL) return; in_deriv->DiffLogSoftmaxPerRow(out_value, out_deriv); From 8e5f520a94920688e3cb98f34b809385225da899 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 4 Jan 2018 19:10:09 -0500 Subject: [PATCH 045/184] [src] Work around problem related to ungetc failures on ifstream --- src/base/io-funcs.cc | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/base/io-funcs.cc b/src/base/io-funcs.cc index 8b8662b6401..90988faf3ea 100644 --- a/src/base/io-funcs.cc +++ b/src/base/io-funcs.cc @@ -178,8 +178,14 @@ int PeekToken(std::istream &is, bool binary) { } int ans = is.peek(); if (read_bracket) { - if (!is.unget()) + if (!is.unget()) { KALDI_WARN << "Error ungetting '<' in PeekToken"; + // Clear the bad bit. It seems to be possible for this code to be + // reached, and the C++ standard is very vague on whether even a single + // call to unget() should succeed; see + // http://www.cplusplus.com/reference/istream/istream/unget/ + is.clear(); + } } return ans; } @@ -197,7 +203,12 @@ void ExpectToken(std::istream &is, bool binary, const char *token) { KALDI_ERR << "Failed to read token [started at file position " << pos_at_start << "], expected " << token; } - if (strcmp(str.c_str(), token) != 0) { + // The second half of the '&&' expression below is so that if we're expecting + // "", we will accept "Foo>" instead. This is so that the model-reading + // code will tolerate errors in PeekToken where is.unget() failed; search for + // is.clear() in PeekToken() for an explanation. + if (strcmp(str.c_str(), token) != 0 && + !(token[0] == '<' && strcmp(str.c_str(), token + 1) == 0)) { KALDI_ERR << "Expected token \"" << token << "\", got instead \"" << str <<"\"."; } From b76f02abd051ec54427148fa118f6c57a394c4b4 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 4 Jan 2018 19:11:00 -0500 Subject: [PATCH 046/184] [scripts] improve messages in chain training. 
--- egs/wsj/s5/steps/nnet3/chain/train.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index e52d2ecee20..011b6894938 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -216,15 +216,13 @@ def process_args(args): "--trainer.deriv-truncate-margin.".format( args.deriv_truncate_margin)) - if (not os.path.exists(args.dir) - or (not os.path.exists(args.dir+"/configs") and - not os.path.exists(args.input_model))): - raise Exception("This script expects {0} to exist. Also either " - "--trainer.input-model option as initial 'raw' model " - "(used as 0.raw in the script) should be supplied or " - "{0}/configs directory which is the output of " - "make_configs.py script should be provided." - "".format(args.dir)) + if (not os.path.exists(args.dir)): + raise Exception("This script expects --dir={0} to exist.") + if (not os.path.exists(args.dir+"/configs") and + (args.input_model is None or not os.path.exists(args.input_model))): + raise Exception("Either --trainer.input-model option should be supplied, " + "and exist; or the {0}/configs directory should exist.") + if args.transform_dir is None: args.transform_dir = args.lat_dir From 414d33c5c9198276ac6b301c3f547b08a1bc91fa Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Sun, 31 Dec 2017 12:03:33 -0800 Subject: [PATCH 047/184] [src] Make faster ApplyFloor and ApplyCeiling variants for Matrix and CuMatrix. (#2115) ( breaks backwards compatibility, since the signatures of these methods change). 
--- src/cudamatrix/cu-kernels.cu | 8 +- src/cudamatrix/cu-vector-speed-test.cc | 93 ++++++++++++++++++++++ src/cudamatrix/cu-vector-test.cc | 44 ++++++++-- src/cudamatrix/cu-vector.cc | 63 +++++++++------ src/cudamatrix/cu-vector.h | 5 +- src/feat/feature-functions.cc | 3 +- src/gmm/mle-diag-gmm.cc | 2 +- src/ivector/ivector-extractor.cc | 6 +- src/ivector/plda.cc | 3 +- src/matrix/kaldi-vector.cc | 42 ++++++---- src/matrix/kaldi-vector.h | 11 ++- src/matrix/matrix-lib-test.cc | 17 +++- src/nnet2/get-feature-transform.cc | 3 +- src/nnet2/nnet-precondition-online-test.cc | 5 +- src/nnet2/nnet-precondition-online.cc | 3 +- src/nnet3/natural-gradient-online-test.cc | 5 +- src/nnet3/natural-gradient-online.cc | 3 +- src/nnet3/nnet-general-component.cc | 3 +- src/nnet3/nnet-simple-component.cc | 3 +- src/sgmm2/am-sgmm2.cc | 3 +- 20 files changed, 254 insertions(+), 71 deletions(-) diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 4ebdcf6c988..2f8f37224be 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -7,7 +7,7 @@ // 2013 Xiaohui Zhang // 2013-2015 Guoguo Chen // 2016-2017 Shiyin Kang -// 2017 Hossein Hadian +// 2017 Hossein Hadian, Daniel Galvez // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -1879,8 +1879,7 @@ static void _apply_floor(Real* mat, Real floor_val, MatrixDim d) { int index = i + j * d.stride; if (i < d.cols && j < d.rows) { - if (mat[index] < floor_val) - mat[index] = floor_val; + mat[index] = max(mat[index], floor_val); } } @@ -2036,8 +2035,7 @@ static void _apply_ceiling(Real* mat, Real ceiling_val, MatrixDim d) { int index = i + j * d.stride; if (i < d.cols && j < d.rows) { - if (mat[index] > ceiling_val) - mat[index] = ceiling_val; + mat[index] = min(mat[index], ceiling_val); } } diff --git a/src/cudamatrix/cu-vector-speed-test.cc b/src/cudamatrix/cu-vector-speed-test.cc index 7227c25c0b1..b5efda3d8de 100644 --- a/src/cudamatrix/cu-vector-speed-test.cc +++ b/src/cudamatrix/cu-vector-speed-test.cc @@ -1,6 +1,7 @@ // cudamatrix/cu-vector-speed-test.cc // Copyright 2013 Johns Hopkins University (author: Daniel Povey) +// Copyright 2017 Daniel Galvez // See ../../COPYING for clarification regarding multiple authors // @@ -256,6 +257,90 @@ template void TestCuVectorAddColSumMat(int32 dim, MatrixTranspose } +template void TestCuVectorApplyFloor(int32 dim) { + BaseFloat time_in_secs = 0.02; + CuVector v(dim); + v.SetRandn(); + Real threshold = RandInt(-35000, 35000) / Real(100); + + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + MatrixIndexT dummy_count; + v.ApplyFloor(threshold, &dummy_count); + } + + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuVector::ApplyFloor" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; + +} + + +template void TestCuVectorApplyFloorNoCount(int32 dim) { + BaseFloat time_in_secs = 0.02; + CuVector v(dim); + v.SetRandn(); + Real threshold = RandInt(-35000, 35000) / Real(100); + + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + v.ApplyFloor(threshold, nullptr); + } + + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09); + 
KALDI_LOG << "For CuVector::ApplyFloor (no count variety)" << NameOf() + << ", for dim = " << dim << ", speed was " << gflops + << " gigaflops."; + +} + + +template void TestCuVectorApplyCeiling(int32 dim) { + BaseFloat time_in_secs = 0.02; + CuVector v(dim); + v.SetRandn(); + Real threshold = RandInt(-35000, 35000) / Real(100); + + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + MatrixIndexT dummy_count; + v.ApplyCeiling(threshold, &dummy_count); + } + + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuVector::ApplyCeiling" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; + +} + + +template void TestCuVectorApplyCeilingNoCount(int32 dim) { + BaseFloat time_in_secs = 0.02; + CuVector v(dim); + v.SetRandn(); + Real threshold = RandInt(-35000, 35000) / Real(100); + + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + v.ApplyCeiling(threshold, nullptr); + } + + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuVector::ApplyCeiling (no count variety)" << NameOf() + << ", for dim = " << dim << ", speed was " << gflops + << " gigaflops."; + +} + + template void CudaVectorSpeedTest() { std::vector sizes; sizes.push_back(16); @@ -296,6 +381,14 @@ template void CudaVectorSpeedTest() { TestCuVectorAddColSumMat(sizes[s], kNoTrans); TestCuVectorAddColSumMat(sizes[s], kTrans); } + for (int32 s = 0; s < ns; s++) { + TestCuVectorApplyFloor(sizes[s]); + TestCuVectorApplyFloorNoCount(sizes[s]); + } + for (int32 s = 0; s < ns; s++) { + TestCuVectorApplyCeiling(sizes[s]); + TestCuVectorApplyCeilingNoCount(sizes[s]); + } } diff --git a/src/cudamatrix/cu-vector-test.cc b/src/cudamatrix/cu-vector-test.cc index 174a2dca6bf..0aa8ae931a4 100644 --- a/src/cudamatrix/cu-vector-test.cc +++ b/src/cudamatrix/cu-vector-test.cc @@ -2,7 +2,7 @@ // Copyright 2013 Lucas Ondel // 2013 Johns 
Hopkins University (author: Daniel Povey) -// 2017 Hossein Hadian +// 2017 Hossein Hadian, Daniel Galvez // See ../../COPYING for clarification regarding multiple authors // @@ -550,8 +550,9 @@ template void CuVectorUnitTestApplyFloor() { Vector vector(cu_vector); BaseFloat floor = 0.33 * (-5 + Rand() % 10); - int32 i = cu_vector.ApplyFloor(floor); - int32 j = vector.ApplyFloor(floor); + MatrixIndexT i, j; + cu_vector.ApplyFloor(floor, &i); + vector.ApplyFloor(floor, &j); CuVector cu2(vector); @@ -563,6 +564,21 @@ template void CuVectorUnitTestApplyFloor() { } } +template void CuVectorUnitTestApplyFloorNoCount() { + for (int32 l = 0; l < 10; l++) { + int32 dim = 100 + Rand() % 700; + CuVector cu_vector1(dim); + cu_vector1.SetRandn(); + CuVector cu_vector2(cu_vector1); + + BaseFloat floor = 0.33 * (-5 + Rand() % 10); + MatrixIndexT dummy_count; + cu_vector1.ApplyFloor(floor, &dummy_count); + cu_vector2.ApplyFloor(floor, nullptr); + AssertEqual(cu_vector1, cu_vector2); + } +} + template void CuVectorUnitTestApplyCeiling() { for (int32 l = 0; l < 10; l++) { int32 dim = 100 + Rand() % 700; @@ -571,8 +587,9 @@ template void CuVectorUnitTestApplyCeiling() { Vector vector(cu_vector); BaseFloat floor = 0.33 * (-5 + Rand() % 10); - int32 i = cu_vector.ApplyCeiling(floor); - int32 j = vector.ApplyCeiling(floor); + MatrixIndexT i, j; + cu_vector.ApplyCeiling(floor, &i); + vector.ApplyCeiling(floor, &j); CuVector cu2(vector); @@ -584,6 +601,21 @@ template void CuVectorUnitTestApplyCeiling() { } } +template void CuVectorUnitTestApplyCeilingNoCount() { + for (int32 l = 0; l < 10; l++) { + int32 dim = 100 + Rand() % 700; + CuVector cu_vector1(dim); + cu_vector1.SetRandn(); + CuVector cu_vector2(cu_vector1); + + BaseFloat floor = 0.33 * (-5 + Rand() % 10); + MatrixIndexT dummy_count; + cu_vector1.ApplyCeiling(floor, &dummy_count); + cu_vector2.ApplyCeiling(floor, nullptr); + AssertEqual(cu_vector1, cu_vector2); + } +} + template void CuVectorUnitTestApplyPow() { for (int32 l = 0; 
l < 10; l++) { int32 dim = 100 + Rand() % 700; @@ -770,6 +802,8 @@ template void CuVectorUnitTest() { CuVectorUnitTestApplyExp(); CuVectorUnitTestApplyLog(); CuVectorUnitTestApplyFloor(); + CuVectorUnitTestApplyFloorNoCount(); + CuVectorUnitTestApplyCeilingNoCount(); CuVectorUnitTestApplyCeiling(); CuVectorUnitTestApplyPow(); CuVectorUnitTestAddMatVec(); diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index aa708142696..f61fd4408db 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -2,6 +2,7 @@ // Copyright 2012-2013 Karel Vesely // 2012-2014 Johns Hopkins University (author: Daniel Povey) +// 2017 Daniel Galvez // See ../../COPYING for clarification regarding multiple authors // @@ -342,52 +343,68 @@ void CuVectorBase::ApplySoftMax() { } template -MatrixIndexT CuVectorBase::ApplyFloor(Real floor_val) { - MatrixIndexT num_floored = 0; +void CuVectorBase::ApplyFloor(Real floor_val, MatrixIndexT *floored_count) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - if (dim_ == 0) return 0; - CuTimer tim; int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(dim_,CU1DBLOCK)); + if (floored_count == nullptr) { + if (dim_ == 0) return; + CuTimer tim; + // We are calling a function meant for matrices, by viewing the + // vector as a matrix with a single row. 
+ ::MatrixDim dim = {1, Dim(), 1}; + cuda_apply_floor(dimGrid, dimBlock, data_, floor_val, dim); + CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyFloorNoCount", tim); + } else { + if (dim_ == 0) { *floored_count = 0; return; } + CuTimer tim; - CuVector count_vec(dim_, kUndefined); + CuVector count_vec(dim_, kUndefined); - cuda_vec_apply_floor(dimGrid, dimBlock, data_, floor_val, count_vec.Data(), dim_); - CU_SAFE_CALL(cudaGetLastError()); - num_floored = count_vec.Sum(); - CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyFloor", tim); + cuda_vec_apply_floor(dimGrid, dimBlock, data_, floor_val, count_vec.Data(), dim_); + CU_SAFE_CALL(cudaGetLastError()); + *floored_count = count_vec.Sum(); + CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyFloor", tim); + } } else #endif { - num_floored = Vec().ApplyFloor(floor_val); + Vec().ApplyFloor(floor_val, floored_count); } - return num_floored; - } template -MatrixIndexT CuVectorBase::ApplyCeiling(Real ceiling_val) { - MatrixIndexT num_ceiled = 0; +void CuVectorBase::ApplyCeiling(Real ceiling_val, MatrixIndexT *ceiled_count) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - if (dim_ == 0) return 0; - CuTimer tim; int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(dim_,CU1DBLOCK)); + if (ceiled_count == nullptr) { + if (dim_ == 0) return; + CuTimer tim; + // We are calling a function meant for matrices, by viewing the + // vector as a matrix with a single row. 
+ ::MatrixDim dim = {1, Dim(), 1}; + cuda_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, dim); + + CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyCeilingNoCount", tim); + } else { + if (dim_ == 0) { *ceiled_count = 0; return; } + CuTimer tim; - CuVector count_vec(dim_, kUndefined); + CuVector count_vec(dim_, kUndefined); - cuda_vec_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, count_vec.Data(), dim_); - CU_SAFE_CALL(cudaGetLastError()); - num_ceiled = count_vec.Sum(); - CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyCeiling", tim); + cuda_vec_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, count_vec.Data(), dim_); + CU_SAFE_CALL(cudaGetLastError()); + *ceiled_count = count_vec.Sum(); + CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyCeiling", tim); + } } else #endif { - num_ceiled = Vec().ApplyCeiling(ceiling_val); + Vec().ApplyCeiling(ceiling_val, ceiled_count); } - return num_ceiled; } template diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index 2c9768b2998..69ca2ae3125 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -5,6 +5,7 @@ // Lucas Ondel // 2013 Xiaohui Zhang // 2015 Guoguo Chen +// 2017 Daniel Galvez // See ../../COPYING for clarification regarding multiple authors // @@ -133,8 +134,8 @@ class CuVectorBase { void ApplySoftMax(); void ApplyExp(); void ApplyLog(); - MatrixIndexT ApplyFloor(Real floor_val); - MatrixIndexT ApplyCeiling(Real ceiling_val); + void ApplyFloor(Real floor_val, MatrixIndexT *floored_count = NULL); + void ApplyCeiling(Real ceiling_val, MatrixIndexT *ceiled_count = NULL); void ApplyPow(Real power); Real Sum() const; diff --git a/src/feat/feature-functions.cc b/src/feat/feature-functions.cc index efb83baf52e..4ae2550c364 100644 --- a/src/feat/feature-functions.cc +++ b/src/feat/feature-functions.cc @@ -321,7 +321,8 @@ void SlidingWindowCmnInternal(const SlidingWindowCmnOptions &opts, variance.AddVec2(-1.0 / (window_frames * window_frames), 
cur_sum); // now "variance" is the variance of the features in the window, // around their own mean. - int32 num_floored = variance.ApplyFloor(1.0e-10); + int32 num_floored; + variance.ApplyFloor(1.0e-10, &num_floored); if (num_floored > 0 && num_frames > 1) { if (opts.max_warnings == warning_count) { KALDI_WARN << "Suppressing the remaining variance flooring " diff --git a/src/gmm/mle-diag-gmm.cc b/src/gmm/mle-diag-gmm.cc index bf2fcd5a0bd..48fc0d4d740 100644 --- a/src/gmm/mle-diag-gmm.cc +++ b/src/gmm/mle-diag-gmm.cc @@ -343,7 +343,7 @@ void MleDiagGmmUpdate(const MleDiagGmmOptions &config, if (config.variance_floor_vector.Dim() != 0) { floored = var.ApplyFloor(config.variance_floor_vector); } else { - floored = var.ApplyFloor(config.min_variance); + var.ApplyFloor(config.min_variance, &floored); } if (floored != 0) { elements_floored += floored; diff --git a/src/ivector/ivector-extractor.cc b/src/ivector/ivector-extractor.cc index 44393e79879..aaba3837698 100644 --- a/src/ivector/ivector-extractor.cc +++ b/src/ivector/ivector-extractor.cc @@ -348,7 +348,8 @@ static double GetLogDetNoFailure(const SpMatrix &var) { } catch (...) 
{ Vector eigs(var.NumRows()); var.Eig(&eigs); - int32 floored = eigs.ApplyFloor(1.0e-20); + int32 floored; + eigs.ApplyFloor(1.0e-20, &floored); if (floored > 0) KALDI_WARN << "Floored " << floored << " eigenvalues of variance."; eigs.ApplyLog(); @@ -1579,7 +1580,8 @@ double IvectorExtractorStats::UpdatePrior( covar.Eig(&s, &P); KALDI_LOG << "Eigenvalues of iVector covariance range from " << s.Min() << " to " << s.Max(); - int32 num_floored = s.ApplyFloor(1.0e-07); + int32 num_floored; + s.ApplyFloor(1.0e-07, &num_floored); if (num_floored > 0) KALDI_WARN << "Floored " << num_floored << " eigenvalues of covar " << "of iVectors."; diff --git a/src/ivector/plda.cc b/src/ivector/plda.cc index 748d6e8d502..d14d392e2f5 100644 --- a/src/ivector/plda.cc +++ b/src/ivector/plda.cc @@ -488,7 +488,8 @@ void PldaEstimator::GetOutput(Plda *plda) { between_var_proj.Eig(&s, &U); KALDI_ASSERT(s.Min() >= 0.0); - int32 n = s.ApplyFloor(0.0); + int32 n; + s.ApplyFloor(0.0, &n); if (n > 0) { KALDI_WARN << "Floored " << n << " eigenvalues of between-class " << "variance to zero."; diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc index d3d3de47013..8cedc9c0487 100644 --- a/src/matrix/kaldi-vector.cc +++ b/src/matrix/kaldi-vector.cc @@ -5,6 +5,7 @@ // Petr Schwarz; Yanmin Qian; Jan Silovsky; // Haihua Xu; Wei Shi // 2015 Guoguo Chen +// 2017 Daniel Galvez // See ../../COPYING for clarification regarding multiple authors @@ -811,30 +812,41 @@ void VectorBase::ApplyAbs() { } template -MatrixIndexT VectorBase::ApplyFloor(Real floor_val) { - MatrixIndexT num_floored = 0; - for (MatrixIndexT i = 0; i < dim_; i++) { - if (data_[i] < floor_val) { - data_[i] = floor_val; - num_floored++; +void VectorBase::ApplyFloor(Real floor_val, MatrixIndexT *floored_count) { + if (floored_count == nullptr) { + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] = std::max(data_[i], floor_val); + } + } else { + MatrixIndexT num_floored = 0; + for (MatrixIndexT i = 0; i < dim_; i++) { + if 
(data_[i] < floor_val) { + data_[i] = floor_val; + num_floored++; + } } + *floored_count = num_floored; } - return num_floored; } template -MatrixIndexT VectorBase::ApplyCeiling(Real ceil_val) { - MatrixIndexT num_changed = 0; - for (MatrixIndexT i = 0; i < dim_; i++) { - if (data_[i] > ceil_val) { - data_[i] = ceil_val; - num_changed++; +void VectorBase::ApplyCeiling(Real ceil_val, MatrixIndexT *ceiled_count) { + if (ceiled_count == nullptr) { + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] = std::min(data_[i], ceil_val); + } + } else { + MatrixIndexT num_changed = 0; + for (MatrixIndexT i = 0; i < dim_; i++) { + if (data_[i] > ceil_val) { + data_[i] = ceil_val; + num_changed++; + } } + *ceiled_count = num_changed; } - return num_changed; } - template MatrixIndexT VectorBase::ApplyFloor(const VectorBase &floor_vec) { KALDI_ASSERT(floor_vec.Dim() == dim_); diff --git a/src/matrix/kaldi-vector.h b/src/matrix/kaldi-vector.h index dcfdd47b09c..3eb4a932095 100644 --- a/src/matrix/kaldi-vector.h +++ b/src/matrix/kaldi-vector.h @@ -6,6 +6,7 @@ // Karel Vesely; Go Vivace Inc.; Arnab Ghoshal // Wei Shi; // 2015 Guoguo Chen +// 2017 Daniel Galvez // See ../../COPYING for clarification regarding multiple authors // @@ -133,11 +134,13 @@ class VectorBase { /// Take absolute value of each of the elements void ApplyAbs(); - /// Applies floor to all elements. Returns number of elements floored. - MatrixIndexT ApplyFloor(Real floor_val); + /// Applies floor to all elements. Returns number of elements + /// floored in floored_count if it is non-null. + void ApplyFloor(Real floor_val, MatrixIndexT *floored_count = nullptr); - /// Applies ceiling to all elements. Returns number of elements changed. - MatrixIndexT ApplyCeiling(Real ceil_val); + /// Applies ceiling to all elements. Returns number of elements + /// changed in ceiled_count if it is non-null. + void ApplyCeiling(Real ceil_val, MatrixIndexT *ceiled_count = nullptr); /// Applies floor to all elements. 
Returns number of elements floored. MatrixIndexT ApplyFloor(const VectorBase &floor_vec); diff --git a/src/matrix/matrix-lib-test.cc b/src/matrix/matrix-lib-test.cc index b97c70dbbdf..ef82ee7ed02 100644 --- a/src/matrix/matrix-lib-test.cc +++ b/src/matrix/matrix-lib-test.cc @@ -6,6 +6,7 @@ // Johns Hopkins University (Author: Daniel Povey); // Haihua Xu; Wei Shi // 2015 Guoguo Chen +// 2017 Daniel Galvez // See ../../COPYING for clarification regarding multiple authors // @@ -2286,8 +2287,10 @@ template static void UnitTestFloorCeiling() { v.SetRandn(); Real pivot = v(5); Vector f(v), f2(v), c(v), c2(v); - MatrixIndexT floored2 = f.ApplyFloor(pivot), - ceiled2 = c.ApplyCeiling(pivot); + MatrixIndexT floored2; + f.ApplyFloor(pivot, &floored2); + MatrixIndexT ceiled2; + c.ApplyCeiling(pivot, &ceiled2); MatrixIndexT floored = 0, ceiled = 0; for (MatrixIndexT d = 0; d < dimM; d++) { if (f2(d) < pivot) { f2(d) = pivot; floored++; } @@ -2297,6 +2300,16 @@ template static void UnitTestFloorCeiling() { AssertEqual(c, c2); KALDI_ASSERT(floored == floored2); KALDI_ASSERT(ceiled == ceiled2); + + // Check that the non-counting variants are equivalent to the counting + // variants. + Vector f3(v); + f3.ApplyFloor(pivot); + AssertEqual(f2, f3); + + Vector c3(v); + c3.ApplyCeiling(pivot); + AssertEqual(c2, c3); } } diff --git a/src/nnet2/get-feature-transform.cc b/src/nnet2/get-feature-transform.cc index 3f348d2de76..38ec9bc3da9 100644 --- a/src/nnet2/get-feature-transform.cc +++ b/src/nnet2/get-feature-transform.cc @@ -111,7 +111,8 @@ void FeatureTransformEstimate::EstimateInternal( Vector s(min_dim); M->Svd(&s, &U, &Vt); // decompose m = U diag(s) Vt. 
BaseFloat max_s = s.Max(); - int32 n = s.ApplyCeiling(opts.max_singular_value); + int32 n; + s.ApplyCeiling(opts.max_singular_value, &n); if (n > 0) { KALDI_LOG << "Applied ceiling to " << n << " out of " << s.Dim() << " singular values of transform using ceiling " diff --git a/src/nnet2/nnet-precondition-online-test.cc b/src/nnet2/nnet-precondition-online-test.cc index 30f9a33ef5e..b0306db72a2 100644 --- a/src/nnet2/nnet-precondition-online-test.cc +++ b/src/nnet2/nnet-precondition-online-test.cc @@ -170,7 +170,8 @@ void OnlinePreconditionerSimple::PreconditionDirectionsCpu( Z_t.Eig(&c_t, &U_t); SortSvd(&c_t, &U_t); double c_t_floor = pow(rho_t_ * (1.0 - eta), 2); - int32 nf = c_t.ApplyFloor(c_t_floor); + int32 nf; + c_t.ApplyFloor(c_t_floor, &nf); if (nf > 0) { KALDI_WARN << "Floored " << nf << " elements of c_t."; } @@ -198,7 +199,7 @@ void OnlinePreconditionerSimple::PreconditionDirectionsCpu( KALDI_WARN << "flooring rho_{t+1} to " << floor_val << ", was " << rho_t1; rho_t1 = floor_val; } - nf = d_t1.ApplyFloor(floor_val); + d_t1.ApplyFloor(floor_val, &nf); if (nf > 0) { KALDI_VLOG(3) << "d_t1 was " << d_t1; KALDI_WARN << "Floored " << nf << " elements of d_{t+1}."; diff --git a/src/nnet2/nnet-precondition-online.cc b/src/nnet2/nnet-precondition-online.cc index 7154548f175..51e7c5b13c6 100644 --- a/src/nnet2/nnet-precondition-online.cc +++ b/src/nnet2/nnet-precondition-online.cc @@ -416,7 +416,8 @@ void OnlinePreconditioner::PreconditionDirectionsInternal( bool must_reorthogonalize = (c_t(0) > condition_threshold * c_t(R - 1)); BaseFloat c_t_floor = pow(rho_t * (1 - eta), 2); - int32 nf = c_t.ApplyFloor(c_t_floor); + int32 nf; + c_t.ApplyFloor(c_t_floor, &nf); if (nf > 0) must_reorthogonalize = true; if (nf > 0 && self_debug_) { diff --git a/src/nnet3/natural-gradient-online-test.cc b/src/nnet3/natural-gradient-online-test.cc index 2829d4ebde7..445cc43f868 100644 --- a/src/nnet3/natural-gradient-online-test.cc +++ b/src/nnet3/natural-gradient-online-test.cc @@ 
-170,7 +170,8 @@ void OnlineNaturalGradientSimple::PreconditionDirectionsCpu( Z_t.Eig(&c_t, &U_t); SortSvd(&c_t, &U_t); double c_t_floor = pow(rho_t_ * (1.0 - eta), 2); - int32 nf = c_t.ApplyFloor(c_t_floor); + int32 nf; + c_t.ApplyFloor(c_t_floor, &nf); if (nf > 0) { KALDI_WARN << "Floored " << nf << " elements of c_t."; } @@ -198,7 +199,7 @@ void OnlineNaturalGradientSimple::PreconditionDirectionsCpu( KALDI_WARN << "flooring rho_{t+1} to " << floor_val << ", was " << rho_t1; rho_t1 = floor_val; } - nf = d_t1.ApplyFloor(floor_val); + d_t1.ApplyFloor(floor_val, &nf); if (nf > 0) { KALDI_VLOG(3) << "d_t1 was " << d_t1; KALDI_WARN << "Floored " << nf << " elements of d_{t+1}."; diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc index 19a7d5fafdc..b5740053f46 100644 --- a/src/nnet3/natural-gradient-online.cc +++ b/src/nnet3/natural-gradient-online.cc @@ -406,7 +406,8 @@ void OnlineNaturalGradient::PreconditionDirectionsInternal( bool must_reorthogonalize = (c_t(0) > condition_threshold * c_t(R - 1)); BaseFloat c_t_floor = pow(rho_t * (1 - eta), 2); - int32 nf = c_t.ApplyFloor(c_t_floor); + int32 nf; + c_t.ApplyFloor(c_t_floor, &nf); if (nf > 0) must_reorthogonalize = true; if (nf > 0 && self_debug_) { diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index bc7405f2836..dd6e950a7d1 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1127,7 +1127,8 @@ void BackpropTruncationComponent::Backprop(const std::string &debug_info, kNoTrans, 0.0); // now clipping_scales contains the squared (norm of each row divided by // clipping_threshold) - int32 num_not_scaled = clipping_scales.ApplyFloor(1.0); + int32 num_not_scaled; + clipping_scales.ApplyFloor(1.0, &num_not_scaled); // now clipping_scales contains min(1, squared-(norm/clipping_threshold)) clipping_scales.ApplyPow(-0.5); // now clipping_scales contains max(1, clipping_threshold/vector_norm) diff --git 
a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 5bd6ffeee32..ea5a2489bc4 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -617,7 +617,8 @@ void ClipGradientComponent::Backprop(const std::string &debug_info, kNoTrans, 0.0); // now clipping_scales contains the squared (norm of each row divided by // clipping_threshold) - int32 num_not_scaled = clipping_scales.ApplyFloor(1.0); + int32 num_not_scaled; + clipping_scales.ApplyFloor(1.0, &num_not_scaled); // now clipping_scales contains min(1, // squared-(norm/clipping_threshold)) if (num_not_scaled != clipping_scales.Dim()) { diff --git a/src/sgmm2/am-sgmm2.cc b/src/sgmm2/am-sgmm2.cc index 86623a12ca2..d249a5ab8b2 100644 --- a/src/sgmm2/am-sgmm2.cc +++ b/src/sgmm2/am-sgmm2.cc @@ -1045,7 +1045,8 @@ void AmSgmm2::ComputeFmllrPreXform(const Vector &state_occs, tmpB.Eig(diag_mean_scatter, &U); // Eq. (B.5): B = U D V^T int32 n; - if ((n = diag_mean_scatter->ApplyFloor(1.0e-04)) != 0) + diag_mean_scatter->ApplyFloor(1.0e-04, &n); + if (n != 0) KALDI_WARN << "Floored " << n << " elements of the mean-scatter matrix."; // Eq. (B.6): A_{pre} = U^T * L^{-1} From 47f9cc1c61da7b7e1f969006dda1b3c84c31f137 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Sun, 31 Dec 2017 12:59:23 -0800 Subject: [PATCH 048/184] [scripts] Fig bug in validate_data_dir.sh introduced in df7a41978f2. Thx:@jcsilva --- egs/wsj/s5/utils/validate_data_dir.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh index 11f1db806b3..dbbaeb10d5d 100755 --- a/egs/wsj/s5/utils/validate_data_dir.sh +++ b/egs/wsj/s5/utils/validate_data_dir.sh @@ -164,8 +164,8 @@ if [ -f $data/wav.scp ]; then if [ -f $data/text ]; then ! 
cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts"; - exit 1 + echo "$0: Lengths are $segments_len vs $num_utts" && \ + exit 1 fi cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings From d884c01da8f2e51cfe383f85432f40b645618163 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Sun, 31 Dec 2017 13:05:56 -0800 Subject: [PATCH 049/184] [scripts] Fix lattice_oracle_align.sh bug (Thx: @roman-verbit-ai) --- egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh b/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh index 8cac7263d78..29d52588807 100755 --- a/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh +++ b/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh @@ -74,7 +74,7 @@ oov=$(cat $lang/oov.int) utils/split_data.sh --per-utt $data $nj -sdata=$data/split$nj +sdata=$data/split${nj}utt if [ $stage -le 1 ]; then $cmd JOB=1:$nj $dir/log/get_oracle.JOB.log \ From 3e57faafb20344e24399e8bff0a58af39e69109c Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Mon, 1 Jan 2018 12:27:56 -0800 Subject: [PATCH 050/184] [build] Add new search dir for ATLAS (Thx: Sari Sultan) --- src/configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/configure b/src/configure index bec077b4a92..fc07cb2fc70 100755 --- a/src/configure +++ b/src/configure @@ -748,7 +748,7 @@ function linux_check_dynamic { function linux_configure_dynamic { if $threaded_atlas; then pt=t; else pt=s; fi # relevant to "fat" libraries, will change later for separate ones if [ -z $ATLASLIBDIR ]; then # Note: it'll pick up the last one below. 
- for dir in /usr{,/local}/lib{,64}{,/atlas,/atlas-sse2,/atlas-sse3} \ + for dir in /usr{,/local}/lib{,64}{,/atlas,/atlas-sse2,/atlas-sse3,/x86_64-linux-gnu} \ `pwd`/../tools/ATLAS/build/install/lib/ $ATLASROOT/lib; do linux_check_dynamic && ATLASLIBDIR=$dir && ATLASLIBNAME=$atlas_libname done From 30b623f84def68cdaecd4cc4367e54eba1e38301 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Tue, 2 Jan 2018 12:25:33 -0800 Subject: [PATCH 051/184] [scripts] Fix script issue affecting some xvector training (thanks: daniel garcia-romero) --- egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 9f9b5752ce6..72b776351f6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -531,7 +531,7 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, --verbose=3 {raw_models} \ "ark,bg:nnet3-copy-egs {multitask_egs_opts} \ {egs_rspecifier} ark:- | \ - nnet3-merge-egs --minibatch-size={mbsize} ark:- ark:- |" \ + nnet3-merge-egs --minibatch-size=1:{mbsize} ark:- ark:- |" \ "{out_model}" """.format(command=run_opts.command, combine_queue_opt=run_opts.combine_queue_opt, From 51abf1c617254b013bdf4fe200ec86e616095d20 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Thu, 4 Jan 2018 04:58:20 +0800 Subject: [PATCH 052/184] [egs] Improve comments for fisher_swbd tdnn_lstm_1a (#2122) --- egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh | 2 ++ egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh | 11 ++++++++--- egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh | 2 ++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh index e1df22ede91..9810a03ee58 100644 --- 
a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh @@ -1,4 +1,6 @@ #!/bin/bash +# Copyright 2017 University of Chinese Academy of Sciences (UCAS) Gaofeng Cheng +# Apache 2.0 # The model training procedure is similar to run_blstm_6j.sh under egs/swbd/s5c diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh index 81af4a128e8..d057470552f 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh @@ -1,5 +1,10 @@ #!/bin/bash -# same as run_tdnn_opgru_1a.sh, but replacing Norm-OPGRU with LSTMP. +# Copyright 2017 University of Chinese Academy of Sciences (UCAS) Gaofeng Cheng +# Apache 2.0 + +# Same as run_tdnn_opgru_1a.sh, but replacing Norm-OPGRU with LSTMP. +# Also Batchnorm in TDNN layers does not reduce the WER in Fisher+SWBD, so in run_tdnn_lstm_1a.sh, +# I just apply renorm component in TDNN layers. # ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1a_sp # System tdnn_lstm_1a_sp # WER on eval2000(tg) 12.3 @@ -15,7 +20,7 @@ # Final train prob (xent) -0.882 # Final valid prob (xent) -0.9393 -# ./show_chain_wer.sh tdnn_lstm_1b_sp +# ./show_chain_wer.sh tdnn_lstm_1a_sp # %WER 16.0 | 2628 21594 | 86.3 9.0 4.7 2.3 16.0 54.4 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys # %WER 12.3 | 4459 42989 | 89.4 7.1 3.5 1.7 12.3 49.8 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys # %WER 8.4 | 1831 21395 | 92.7 5.1 2.2 1.1 8.4 42.3 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys @@ -23,7 +28,7 @@ # %WER 12.1 | 4459 42989 | 89.6 6.9 3.5 1.7 12.1 49.2 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys # %WER 8.2 | 1831 21395 | 93.1 5.1 1.8 1.3 8.2 41.7 | 
exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.swbd.filt.sys -# ./show_chain_wer_rt03.sh tdnn_lstm_1b_sp +# ./show_chain_wer_rt03.sh tdnn_lstm_1a_sp # %WER 9.6 | 3970 36721 | 91.5 5.5 3.0 1.1 9.6 41.2 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys # %WER 11.6 | 8420 76157 | 89.7 6.8 3.4 1.4 11.6 43.0 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.filt.sys # %WER 13.3 | 4450 39436 | 88.0 7.4 4.6 1.3 13.3 44.5 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh index e819a987a48..2de8d774451 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh @@ -1,4 +1,6 @@ #!/bin/bash +# Copyright 2017 University of Chinese Academy of Sciences (UCAS) Gaofeng Cheng +# Apache 2.0 # This is based on TDNN_LSTM_1b (from egs/swbd/s5c), but using the NormOPGRU to replace the LSTMP, # and adding chunk-{left,right}-context-initial=0 From 1a383decc5a6c99dd94a13579e95753aff5a499b Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Fri, 5 Jan 2018 00:59:47 +0330 Subject: [PATCH 053/184] [egs] Add OCR/Handwriting Recognition examples (#1984) * OCR: Add IAM corpus with unk decoding support (#3) * Add a new English OCR database 'UW3' * Some minor fixes re IAM corpus * Fix an issue in IAM chain recipes + add a new recipe (#6) * Some fixes based on the pull request review * Various fixes + cleaning on IAM * Fix LM estimation and add extended dictionary + other minor fixes * Add README for IAM * Add output filter for scoring * Fix a bug RE switch to pyhton3 * Add updated results + minor fixes * Remove unk decoding -- gives almost no gain * Add UW3 OCR database * Fix cmd.sh in IAM + fix usages of train/decode_cmd in chain recipes * Various minor fixes on UW3 * 
Rename iam/s5 to iam/v1 * Add README file for UW3 * Various cosmetic fixes on UW3 scripts * Minor fixes in IAM --- egs/iam/README.txt | 4 + egs/iam/v1/cmd.sh | 13 + egs/iam/v1/image | 1 + egs/iam/v1/local/chain/compare_wer.sh | 59 +++++ egs/iam/v1/local/chain/run_cnn_1a.sh | 235 +++++++++++++++++ egs/iam/v1/local/chain/run_cnn_chainali_1a.sh | 244 +++++++++++++++++ egs/iam/v1/local/chain/run_cnn_chainali_1b.sh | 245 ++++++++++++++++++ egs/iam/v1/local/make_features.py | 87 +++++++ egs/iam/v1/local/prepare_data.sh | 149 +++++++++++ egs/iam/v1/local/prepare_dict.sh | 49 ++++ egs/iam/v1/local/process_data.py | 82 ++++++ egs/iam/v1/local/score.sh | 5 + egs/iam/v1/local/train_lm.sh | 139 ++++++++++ egs/iam/v1/local/wer_output_filter | 27 ++ egs/iam/v1/path.sh | 6 + egs/iam/v1/run.sh | 122 +++++++++ egs/iam/v1/steps | 1 + egs/iam/v1/utils | 1 + egs/uw3/README.txt | 4 + egs/uw3/v1/cmd.sh | 13 + egs/uw3/v1/image | 1 + egs/uw3/v1/local/chain/compare_wer.sh | 72 +++++ egs/uw3/v1/local/chain/run_cnn_1a.sh | 234 +++++++++++++++++ egs/uw3/v1/local/make_features.py | 97 +++++++ egs/uw3/v1/local/prepare_data.sh | 40 +++ egs/uw3/v1/local/prepare_dict.sh | 29 +++ egs/uw3/v1/local/process_data.py | 61 +++++ egs/uw3/v1/local/score.sh | 156 +++++++++++ egs/uw3/v1/local/train_lm.sh | 102 ++++++++ .../v1/local/unk_arc_post_to_transcription.py | 86 ++++++ egs/uw3/v1/path.sh | 6 + egs/uw3/v1/run.sh | 106 ++++++++ egs/uw3/v1/steps | 1 + egs/uw3/v1/utils | 1 + 34 files changed, 2478 insertions(+) create mode 100644 egs/iam/README.txt create mode 100644 egs/iam/v1/cmd.sh create mode 120000 egs/iam/v1/image create mode 100755 egs/iam/v1/local/chain/compare_wer.sh create mode 100755 egs/iam/v1/local/chain/run_cnn_1a.sh create mode 100755 egs/iam/v1/local/chain/run_cnn_chainali_1a.sh create mode 100755 egs/iam/v1/local/chain/run_cnn_chainali_1b.sh create mode 100755 egs/iam/v1/local/make_features.py create mode 100755 egs/iam/v1/local/prepare_data.sh create mode 100755 
egs/iam/v1/local/prepare_dict.sh create mode 100755 egs/iam/v1/local/process_data.py create mode 100755 egs/iam/v1/local/score.sh create mode 100755 egs/iam/v1/local/train_lm.sh create mode 100755 egs/iam/v1/local/wer_output_filter create mode 100755 egs/iam/v1/path.sh create mode 100755 egs/iam/v1/run.sh create mode 120000 egs/iam/v1/steps create mode 120000 egs/iam/v1/utils create mode 100644 egs/uw3/README.txt create mode 100644 egs/uw3/v1/cmd.sh create mode 120000 egs/uw3/v1/image create mode 100755 egs/uw3/v1/local/chain/compare_wer.sh create mode 100755 egs/uw3/v1/local/chain/run_cnn_1a.sh create mode 100755 egs/uw3/v1/local/make_features.py create mode 100755 egs/uw3/v1/local/prepare_data.sh create mode 100755 egs/uw3/v1/local/prepare_dict.sh create mode 100755 egs/uw3/v1/local/process_data.py create mode 100755 egs/uw3/v1/local/score.sh create mode 100755 egs/uw3/v1/local/train_lm.sh create mode 100755 egs/uw3/v1/local/unk_arc_post_to_transcription.py create mode 100755 egs/uw3/v1/path.sh create mode 100755 egs/uw3/v1/run.sh create mode 120000 egs/uw3/v1/steps create mode 120000 egs/uw3/v1/utils diff --git a/egs/iam/README.txt b/egs/iam/README.txt new file mode 100644 index 00000000000..daeb67af541 --- /dev/null +++ b/egs/iam/README.txt @@ -0,0 +1,4 @@ + +This directory contains example scripts for handwriting recognition on +the IAM dataset: +http://www.fki.inf.unibe.ch/databases/iam-handwriting-database diff --git a/egs/iam/v1/cmd.sh b/egs/iam/v1/cmd.sh new file mode 100644 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/iam/v1/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/iam/v1/image b/egs/iam/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/iam/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/iam/v1/local/chain/compare_wer.sh b/egs/iam/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..4eb665fc702 --- /dev/null +++ b/egs/iam/v1/local/chain/compare_wer.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/iam/v1/local/chain/run_cnn_1a.sh b/egs/iam/v1/local/chain/run_cnn_1a.sh new file mode 100755 index 00000000000..3b1571091c1 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_1a.sh @@ -0,0 +1,235 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# steps/info/chain_dir_info.pl exp/chain/cnn_1a/ +# exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098) + +# cat exp/chain/cnn_1a/decode_test/scoring_kaldi/best_* +# %WER 5.94 [ 3913 / 65921, 645 ins, 1466 del, 1802 sub ] exp/chain/cnn_1a/decode_test//cer_11_0.0 +# %WER 9.13 [ 1692 / 18542, 162 ins, 487 del, 1043 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0 + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
+ali=tri3_ali +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$cmd" ${train_data_dir} \ + data/$lang_test $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$frame_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + 
--egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh new file mode 100755 index 00000000000..2c8b6c91e5a --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh @@ -0,0 +1,244 @@ +#!/bin/bash + +# chainali_1a is as 1a except it uses chain alignments (using 1a system) instead of gmm alignments + +# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_1a/ +# System cnn_chainali_1a cnn_1a +# WER 6.69 9.13 +# Final train prob -0.0128 -0.0297 +# Final valid prob -0.0447 -0.0975 +# Final train prob (xent) -0.6448 -0.5915 +# Final valid prob (xent) -0.9924 -1.0022 + +# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1a/ +# exp/chain/cnn_chainali_1a/: 
num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.002->0.000 xent:train/valid[13,20,final]=(-0.929,-0.711,-0.645/-1.16,-1.04,-0.992) logprob:train/valid[13,20,final]=(-0.029,-0.016,-0.013/-0.051,-0.047,-0.045) + +# cat exp/chain/cnn_chainali_1a/decode_test/scoring_kaldi/best_* +# %WER 3.94 [ 2600 / 65921, 549 ins, 837 del, 1214 sub ] exp/chain/cnn_chainali_1a/decode_test/cer_15_0.0 +# %WER 6.69 [ 1241 / 18542, 135 ins, 358 del, 748 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_15_0.5 + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. 
We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false 
dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh new file mode 100755 index 00000000000..ddf596a6126 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# chainali_1b is as chainali_1a except it has 3 more cnn layers and 1 less tdnn layer. 
+# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_chainali_1b/ +# System cnn_chainali_1a cnn_chainali_1b +# WER 6.69 6.25 +# Final train prob -0.0132 -0.0041 +# Final valid prob -0.0509 -0.0337 +# Final train prob (xent) -0.6393 -0.6287 +# Final valid prob (xent) -1.0116 -0.9064 + +# steps/info/chain_dir_info.pl exp/chain/chainali_cnn_1b/ +# exp/chain/chainali_cnn_1b/: num-iters=21 nj=2..4 num-params=4.0M dim=40->364 combine=-0.009->-0.005 xent:train/valid[13,20,final]=(-1.47,-0.728,-0.623/-1.69,-1.02,-0.940) logprob:train/valid[13,20,final]=(-0.068,-0.030,-0.011/-0.086,-0.056,-0.038) + +# cat exp/chain/cnn_chainali_1b/decode_test/scoring_kaldi/best_* +# %WER 3.94 [ 2600 / 65921, 415 ins, 1285 del, 900 sub ] exp/chain/cnn_chainali_1b/decode_test/cer_10_0.0 +# %WER 6.25 [ 1158 / 18542, 103 ins, 469 del, 586 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_12_0.0 + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! 
cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + 
conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v1/local/make_features.py b/egs/iam/v1/local/make_features.py new file mode 100755 index 00000000000..b998464953f --- /dev/null +++ b/egs/iam/v1/local/make_features.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script converts images to Kaldi-format feature matrices. The input to + this script is the path to a data directory, e.g. "data/train". This script + reads the images listed in images.scp and writes them to standard output + (by default) as Kaldi-formatted matrices (in text form). It also scales the + images so they have the same height (via --feat-dim). It can optionally pad + the images (on left/right sides) with white pixels. + + eg. 
local/make_features.py data/train --feat-dim 40 +""" + +import argparse +import os +import sys +import numpy as np +from scipy import misc + +parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and + writes them to standard output in text format.""") +parser.add_argument('dir', type=str, + help='Source data directory (containing images.scp)') +parser.add_argument('--out-ark', type=str, default='-', + help='Where to write the output feature file') +parser.add_argument('--feat-dim', type=int, default=40, + help='Size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, + help='Number of white pixels to pad on the left' + 'and right side of the image.') +args = parser.parse_args() + + +def write_kaldi_matrix(file_handle, matrix, key): + file_handle.write(key + " [ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to " + "have the same length") + file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file_handle.write("\n") + file_handle.write(" ]\n") + +def get_scaled_image(im): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + im = misc.imresize(im, (nx, ny)) + padding_x = args.padding + padding_y = im.shape[0] + im_pad = np.concatenate((255 * np.ones((padding_y, padding_x), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((padding_y, padding_x), + dtype=int)), axis=1) + return im_pad1 + +### main ### +data_list_path = os.path.join(args.dir,'images.scp') + +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'wb') + +with open(data_list_path) as f: + for line in f: + line = 
line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + im_scale = get_scaled_image(im) + + data = np.transpose(im_scale, (1, 0)) + data = np.divide(data, 255.0) + write_kaldi_matrix(out_fh, data, image_id) diff --git a/egs/iam/v1/local/prepare_data.sh b/egs/iam/v1/local/prepare_data.sh new file mode 100755 index 00000000000..1350c5841df --- /dev/null +++ b/egs/iam/v1/local/prepare_data.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 + +# This script downloads the IAM handwriting database and prepares the training +# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py. +# It also downloads the LOB and Brown text corpora. It downloads the database files +# only if they do not already exist in download directory. + +# Eg. local/prepare_data.sh +# Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from +# utt2spk file: 000_a01-000u-00 000 +# images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +# spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03 + +stage=0 +download_dir=data/download +username= +password= # username and password for downloading the IAM database + # if you have not already downloaded the database, please + # register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database + # and provide this script with your username and password. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +if [[ ! -f $download_dir/lines.tgz && -z $username ]]; then + echo "$0: Warning: Couldn't find lines.tgz in $download_dir. Unless the extracted dataset files" + echo "exist in your data/local directory this script will fail because the required files" + echo "can't be downloaded automatically (it needs registration)." 
+ echo "Please register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database" + echo "... and then call this script again with --username --password " + echo "" +fi + +lines=data/local/lines +xml=data/local/xml +ascii=data/local/ascii +bcorpus=data/local/browncorpus +lobcorpus=data/local/lobcorpus +data_split_info=data/local/largeWriterIndependentTextLineRecognitionTask +lines_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/lines/lines.tgz +xml_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/xml/xml.tgz +data_split_info_url=http://www.fki.inf.unibe.ch/DBs/iamDB/tasks/largeWriterIndependentTextLineRecognitionTask.zip +ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz +brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt +lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip +mkdir -p $download_dir data/local + +# download and extract images and transcription +if [ -d $lines ]; then + echo "$0: Not downloading lines images as it is already there." +else + if [ ! -f $download_dir/lines.tgz ]; then + echo "$0: Trying to download lines images..." + wget -P $download_dir --user "$username" --password "$password" $lines_url || exit 1; + fi + mkdir -p $lines + tar -xzf $download_dir/lines.tgz -C $lines || exit 1; + echo "$0: Done downloading and extracting lines images" +fi + +if [ -d $xml ]; then + echo "$0: Not downloading transcriptions as it is already there." +else + if [ ! -f $download_dir/xml.tgz ]; then + echo "$0: Trying to download transcriptions..." + wget -P $download_dir --user "$username" --password "$password" $xml_url || exit 1; + fi + mkdir -p $xml + tar -xzf $download_dir/xml.tgz -C $xml || exit 1; + echo "$0: Done downloading and extracting transcriptions." +fi + +if [ -d $data_split_info ]; then + echo "$0: Not downloading data split information as it is already there." +else + if [ ! 
-f $download_dir/largeWriterIndependentTextLineRecognitionTask.zip ]; then + echo "$0: Trying to download training and testing data split information..." + wget -P $download_dir --user "$username" --password "$password" $data_split_info_url || exit 1; + fi + mkdir -p $data_split_info + unzip $download_dir/largeWriterIndependentTextLineRecognitionTask.zip -d $data_split_info || exit 1; + echo "$0: Done downloading and extracting training and testing data split information" +fi + +if [ -d $ascii ]; then + echo "$0: Not downloading ascii.tgz as it is already there." +else + if [ ! -f $download_dir/ascii.tgz ]; then + echo "$0: trying to download ascii.tgz..." + wget -P $download_dir --user "$username" --password "$password" $ascii_url || exit 1; + fi + mkdir -p $ascii + tar -xzf $download_dir/ascii.tgz -C $ascii || exit 1; + echo "$0: Done downloading and extracting ascii.tgz" +fi + +if [ -d $lobcorpus ]; then + echo "$0: Not downloading the LOB text corpus as it is already there." +else + if [ ! -f $lobcorpus/0167.zip ]; then + echo "$0: Downloading the LOB text corpus ..." + mkdir -p $lobcorpus + wget -P $lobcorpus/ $lob_corpus_url || exit 1; + fi + unzip $lobcorpus/0167.zip -d $lobcorpus || exit 1; + echo "$0: Done downloading and extracting LOB corpus" +fi + +if [ -d $bcorpus ]; then + echo "$0: Not downloading the Brown corpus as it is already there." +else + if [ ! -f $bcorpus/brown.txt ]; then + mkdir -p $bcorpus + echo "$0: Downloading the Brown text corpus..." 
+ wget -P $bcorpus $brown_corpus_url || exit 1; + fi + echo "$0: Done downloading the Brown text corpus" +fi + +mkdir -p data/{train,test,val} +file_name=largeWriterIndependentTextLineRecognitionTask + +train_old="data/local/$file_name/trainset.txt" +test_old="data/local/$file_name/testset.txt" +val1_old="data/local/$file_name/validationset1.txt" +val2_old="data/local/$file_name/validationset2.txt" + +train_new="data/local/train.uttlist" +test_new="data/local/test.uttlist" +val_new="data/local/validation.uttlist" + +cat $train_old > $train_new +cat $test_old > $test_new +cat $val1_old $val2_old > $val_new + +if [ $stage -le 0 ]; then + local/process_data.py data/local data/train --dataset train || exit 1 + local/process_data.py data/local data/test --dataset test || exit 1 + local/process_data.py data/local data/val --dataset validation || exit 1 + + utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt + utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt +fi diff --git a/egs/iam/v1/local/prepare_dict.sh b/egs/iam/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..77a46df384f --- /dev/null +++ b/egs/iam/v1/local/prepare_dict.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +mkdir -p $dir + +# First get the set of all letters that occur in data/train/text +cat data/train/text | \ + perl -ne '@A = split; shift @A; for(@A) {print join("\n", split(//)), "\n";}' | \ + sort -u > $dir/nonsilence_phones.txt + +# Now list all the unique words (that use only the above letters) +# in data/train/text and LOB+Brown corpora with their comprising +# letters as their transcription. 
(Letter # is replaced with ) + +export letters=$(cat $dir/nonsilence_phones.txt | tr -d "\n") + +cut -d' ' -f2- data/train/text | \ + cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt \ + data/local/browncorpus/brown.txt - | \ + perl -e '$letters=$ENV{letters}; +while(<>){ @A = split; + foreach(@A) { + if(! $seen{$_} && $_ =~ m/^[$letters]+$/){ + $seen{$_} = 1; + $trans = join(" ", split(//)); + $trans =~ s/#//g; + print "$_ $trans\n"; + } + } +}' | sort > $dir/lexicon.txt + + +sed -i "s/#//" $dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/iam/v1/local/process_data.py b/egs/iam/v1/local/process_data.py new file mode 100755 index 00000000000..fa5eb484707 --- /dev/null +++ b/egs/iam/v1/local/process_data.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_data.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. 
Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text, utt2spk + and images.scp files.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) IAM data') +parser.add_argument('out_dir', type=str, + help='Where to write output files.') +parser.add_argument('--dataset', type=str, default='train', + choices=['train', 'test','validation'], + help='Subset of data to process.') +args = parser.parse_args() + +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.database_path, + args.dataset + '.uttlist') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open (text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + utt_id = line.split(' ')[0] + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[utt_id] = text + +print("Processing '{}' data...".format(args.dataset)) +process_text_file_for_word_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + img_num = line[-3:] + doc = minidom.parse(xml_path) + + form_elements = doc.getElementsByTagName('form')[0] + writer_id = form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = 
form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', + outerfolder, innerfolder, innerfolder) + image_file_path = lines_path + img_num + '.png' + text = text_dict[line] + utt_id = writer_id + '_' + line + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/v1/local/score.sh b/egs/iam/v1/local/score.sh new file mode 100755 index 00000000000..31564d25326 --- /dev/null +++ b/egs/iam/v1/local/score.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh new file mode 100755 index 00000000000..aa4303d6a28 --- /dev/null +++ b/egs/iam/v1/local/train_lm.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains an LM on the LOB+Brown text data and IAM training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. 
+ else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Using LOB and brown corpus. + cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt > ${dir}/data/text/text.txt + cat data/local/browncorpus/brown.txt >> ${dir}/data/text/text.txt + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + + cat data/val/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/iam.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. 
+ cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from IAM text + cat ${dir}/data/text/{iam,text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=3 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=2 iam=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + #log-prob: -5.05603614242 [perplexity = 156.967086371] over 19477.0 words +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 1 million n-grams for a big LM for rescoring purposes. 
+ size=1000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_3_prune_big was -5.06654404785 per word [perplexity = 158.625177948] over 19477.0 words + # current results, after adding --limit-unk-history=true: + + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 500,000 n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=500000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_3_prune_small was -5.24719139498 per word [perplexity = 190.031793995] over 19477.0 words + # current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst): + + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/iam/v1/local/wer_output_filter b/egs/iam/v1/local/wer_output_filter new file mode 100755 index 00000000000..162482539ed --- /dev/null +++ b/egs/iam/v1/local/wer_output_filter @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +# Copyright 2017 Hossein Hadian + +# This is a filter used in scoring. It separates all +# punctuations from words. For e.g. this sentence: + +# "They have come!" he said reverently, gripping his +# hands. "Isn't it a glorious thing! 
Long awaited." + +# is converted to this: + +# " They have come ! " he said reverently , gripping his +# hands . " Isn ' t it a glorious thing ! Long awaited . " + +import sys +import re + +punctuations = "!(),.?;:'-\"" +escaped_punctuations = re.escape(punctuations) + +for line in sys.stdin: + words = line.strip().split() + uttid = words[0] + transcript = ' '.join(words[1:]) + split_transcript = " ".join(re.split("([{}])".format(escaped_punctuations), + transcript)).strip() + print("{} {}".format(uttid, split_transcript)) diff --git a/egs/iam/v1/path.sh b/egs/iam/v1/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/iam/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh new file mode 100755 index 00000000000..c8ebb9ae649 --- /dev/null +++ b/egs/iam/v1/run.sh @@ -0,0 +1,122 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian + +set -e +stage=0 +nj=20 + +# iam_database points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# like "data/download" and follow the instructions +# in "local/prepare_data.sh" to download the database: +iam_database=/export/corpora5/handwriting_ocr/IAM + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + +if [ $stage -le 0 ]; then + echo "$0: Preparing data..." 
+ local/prepare_data.sh --download-dir "$iam_database" +fi +mkdir -p data/{train,test}/data + +if [ $stage -le 1 ]; then + echo "$0: Preparing the test and train feature files..." + for dataset in train test; do + local/make_features.py data/$dataset --feat-dim 40 | \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp + steps/compute_cmvn_stats.sh data/$dataset + done +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ + data/local/dict "" data/lang/temp data/lang +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_big.arpa.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + +if [ $stage -le 4 ]; then + steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train \ + data/lang exp/mono +fi + +if [ $stage -le 5 ]; then + utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ + exp/mono/decode_test +fi + +if [ $stage -le 6 ]; then + steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd $cmd 500 20000 data/train data/lang \ + exp/mono_ali exp/tri +fi + +if [ $stage -le 7 ]; then + utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ + exp/tri/decode_test +fi + +if [ $stage -le 8 ]; then + steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + exp/tri exp/tri_ali + + steps/train_lda_mllt.sh --cmd $cmd \ + --splice-opts "--left-context=3 --right-context=3" 500 20000 \ + data/train data/lang exp/tri_ali exp/tri2 +fi + +if [ $stage -le 9 ]; then + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph + + steps/decode.sh --nj 
$nj --cmd $cmd exp/tri2/graph \ + data/test exp/tri2/decode_test +fi + +if [ $stage -le 10 ]; then + steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ + data/train data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd $cmd 500 20000 \ + data/train data/lang exp/tri2_ali exp/tri3 +fi + +if [ $stage -le 11 ]; then + utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph + + steps/decode_fmllr.sh --nj $nj --cmd $cmd exp/tri3/graph \ + data/test exp/tri3/decode_test +fi + +if [ $stage -le 12 ]; then + steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ + data/train data/lang exp/tri3 exp/tri3_ali +fi + +if [ $stage -le 13 ]; then + local/chain/run_cnn_1a.sh +fi + +if [ $stage -le 14 ]; then + local/chain/run_cnn_chainali_1b.sh --chain-model-dir exp/chain/cnn_1a --stage 2 +fi diff --git a/egs/iam/v1/steps b/egs/iam/v1/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/iam/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/iam/v1/utils b/egs/iam/v1/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/iam/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/uw3/README.txt b/egs/uw3/README.txt new file mode 100644 index 00000000000..b02d00ff541 --- /dev/null +++ b/egs/uw3/README.txt @@ -0,0 +1,4 @@ + +This directory contains example scripts for optical character recognition +(i.e. OCR) on the UW3 dataset (it's a printed English OCR corpus): +http://isis-data.science.uva.nl/events/dlia//datasets/uwash3.html diff --git a/egs/uw3/v1/cmd.sh b/egs/uw3/v1/cmd.sh new file mode 100644 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/uw3/v1/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/uw3/v1/image b/egs/uw3/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/uw3/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/uw3/v1/local/chain/compare_wer.sh b/egs/uw3/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..1a40523355a --- /dev/null +++ b/egs/uw3/v1/local/chain/compare_wer.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# ./local/chain/compare_wer.sh exp/chain/cnn1a +# System cnn1a +# WER 0.61 +# CER 0.15 +# Final train prob -0.0377 +# Final valid prob -0.0380 +# Final train prob (xent) -0.0830 +# Final valid prob (xent) -0.0838 + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... 
]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/uw3/v1/local/chain/run_cnn_1a.sh b/egs/uw3/v1/local/chain/run_cnn_1a.sh new file mode 100755 index 00000000000..ad7421e1261 --- /dev/null +++ b/egs/uw3/v1/local/chain/run_cnn_1a.sh @@ -0,0 +1,234 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# steps/info/chain_dir_info.pl exp/chain/cnn1a/ +# exp/chain/cnn1a/: num-iters=153 nj=3..10 num-params=3.6M dim=40->268 combine=-0.034->-0.034 xent:train/valid[101,152,final]=(-0.097,-0.186,-0.092/-0.101,-0.212,-0.098) logprob:train/valid[101,152,final]=(-0.035,-0.067,-0.035/-0.036,-0.082,-0.035) + +# cat 
exp/chain/cnn1a/decode_test/scoring_kaldi/best_* +# %WER 0.19 [ 366 / 188135, 110 ins, 123 del, 133 sub ] exp/chain/cnn1a/decode_test/cer_7_0.5 +# %WER 1.00 [ 357 / 35571, 104 ins, 26 del, 227 sub ] exp/chain/cnn1a/decode_test/wer_5_1.0 + + +set -e -o pipefail + +stage=0 +nj=30 + +# affix for exp dirs, e.g. it was _cleaned in tedlium. +nnet3_affix= + +affix=1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=5 + +# training chunk-options +chunk_width=340,300,200,100 + +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=false + +gmm_dir=exp/tri2 +ali_dir=exp/tri2_ali +lat_dir=exp/chain${nnet3_affix}/tri2_train_lats +dir=exp/chain${nnet3_affix}/cnn${affix} +train_data_dir=data/train +lores_train_data_dir=$train_data_dir # for the start, use the same data for gmm and chain +gmm_lang=data/lang +lang_test=data/lang_unk +tree_dir=exp/chain${nnet3_affix}/tree${affix} + +# the 'lang' directory is created by this script. +# If you create such a directory with a non-standard topology +# you should probably name it differently. +lang=data/lang_chain + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$cmd" ${lores_train_data_dir} \ + $lang_test $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+ if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" 300 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=12" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-2,-1,0,1,2 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=40 time-offsets=-2,-1,0,1,2 $common1 + relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=450 + relu-batchnorm-layer name=tdnn2 input=Append(-5,0,5) dim=450 + relu-batchnorm-layer name=tdnn3 input=Append(-5,0,5) dim=450 + relu-batchnorm-layer name=tdnn4 input=Append(-5,0,5) dim=450 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=450 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=450 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$frame_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # 
The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/uw3/v1/local/make_features.py b/egs/uw3/v1/local/make_features.py new file mode 100755 index 00000000000..dd0a30a19d7 --- /dev/null +++ b/egs/uw3/v1/local/make_features.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang + +""" This script converts images to Kaldi-format feature matrices. The input to + this script is the path to a data directory, e.g. "data/train". This script + reads the images listed in images.scp and writes them to standard output + (by default) as Kaldi-formatted matrices (in text form). It also scales the + images so they have the same height (via --feat-dim). It can optionally pad + the images (on left/right sides) with white pixels. + + eg. 
local/make_features.py data/train --feat-dim 40 +""" + +import argparse +import os +import sys +import numpy as np +from scipy import misc +from scipy import ndimage + +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE,SIG_DFL) + +parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and + writes them to standard output in text format.""") +parser.add_argument('dir', type=str, help='data directory (should contain images.scp)') +parser.add_argument('--out-ark', type=str, default='-', help='where to write the output feature file.') +parser.add_argument('--feat-dim', type=int, default=40, + help='size to scale the height of all images (i.e. the dimension of the resulting features)') +parser.add_argument('--pad', type=bool, default=False, help='pad the left and right of the images with 10 white pixels.') + +args = parser.parse_args() + + +def write_kaldi_matrix(file_handle, matrix, key): + file_handle.write(key + " [ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to " + "have the same length") + file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file_handle.write("\n") + file_handle.write(" ]\n") + +def get_scaled_image(im): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + # Some Images are rotated + if sy > sx: + im = np.rot90(im, k = -1) + sx = im.shape[1] + sy = im.shape[0] + + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + im = misc.imresize(im, (nx, ny)) + + noise = np.random.normal(2, 1,(nx, ny)) + im = im - noise + + return im + +### main ### +data_list_path = os.path.join(args.dir,'images.scp') + +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'wb') + +with 
open(data_list_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + + im = misc.imread(image_path, flatten = True) + im_scale = get_scaled_image(im) + + if args.pad: + pad = np.ones((args.feat_dim, 10)) * 255 + im_data = np.hstack((pad, im_scale, pad)) + else: + im_data = im_scale + + data = np.transpose(im_data, (1, 0)) + data = np.divide(data, 255.0) + write_kaldi_matrix(out_fh, data, image_id) diff --git a/egs/uw3/v1/local/prepare_data.sh b/egs/uw3/v1/local/prepare_data.sh new file mode 100755 index 00000000000..47f62e4335a --- /dev/null +++ b/egs/uw3/v1/local/prepare_data.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang + +# This script downloads the UW3 dataset (if not already downloaded) +# and prepares the "train" and "test" data subsets. + +set -e +download_dir=data/download + +. ./cmd.sh +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +# Download dir +download_url=http://www.tmbdev.net/ocrdata/uw3-lines-book.tgz +data_dir=data/local/extracted_corpus + +mkdir -p $download_dir +mkdir -p $data_dir + +if [ -d $data_dir/book ]; then + echo "$0: Not downloading dataset as it is already downloaded." +else + if [ ! -f $download_dir/uw3-lines-book.tgz ]; then + echo "$0: Downloading dataset..." + wget -P $download_dir $download_url || exit 1; + fi + echo "$0: Extracting..." + tar -xzf $download_dir/uw3-lines-book.tgz -C $data_dir/ || exit 1; + echo "$0: Done downloading/extracting the dataset." +fi + +mkdir -p data/train +mkdir -p data/test +echo "$0: Preparing the test and train subsets..." 
+local/process_data.py $data_dir/book data || exit 1 + +utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt +utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt diff --git a/egs/uw3/v1/local/prepare_dict.sh b/egs/uw3/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..72c9b50e5ec --- /dev/null +++ b/egs/uw3/v1/local/prepare_dict.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Hossein Hadian + +set -e +dir=data/local/dict + +mkdir -p $dir + +cut -d' ' -f2- data/train/text | tr -cs '[a-z][A-Z][0-9][:punct:]' '\n' | sort -u | \ + awk '{len=split($0,chars,""); printf($0); + for (i=0;i<=len;i++) { + if(chars[i]=="#") {chars[i]=""} + printf(chars[i]" ") + }; + printf("\n")};' | \ + sed 's/.$//' > $dir/lexicon.txt; + +cut -d' ' -f2- $dir/lexicon.txt | tr ' ' '\n' | sort -u >$dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL > $dir/optional_silence.txt + +echo -n "" > $dir/extra_questions.txt diff --git a/egs/uw3/v1/local/process_data.py b/egs/uw3/v1/local/process_data.py new file mode 100755 index 00000000000..f5b37b04c2f --- /dev/null +++ b/egs/uw3/v1/local/process_data.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang + +# This script goes through the downloaded UW3 dataset and creates data files "text", +# "utt2spk", and "images.scp" for the train and test subsets in data/train and data/test. 
+ +# text - matches the transcriptions with the image id +# utt2spk - matches the image id's with the speaker/writer names +# images.scp - matches the image is's with the actual image file + +import argparse +import os +import random + +parser = argparse.ArgumentParser(description="""Creates data/train and data/test.""") +parser.add_argument('database_path', type=str, help='path to downloaded (and extracted) UW3 corpus') +parser.add_argument('out_dir', type=str, default='data', + help='where to create the train and test data directories') +args = parser.parse_args() + +### main ### +train_text_file = os.path.join(args.out_dir, 'train', 'text') +train_text_fh = open(train_text_file, 'w+') +train_utt2spk_file = os.path.join(args.out_dir, 'train', 'utt2spk') +train_utt2spk_fh = open(train_utt2spk_file, 'w+') +train_image_file = os.path.join(args.out_dir, 'train', 'images.scp') +train_image_fh = open(train_image_file, 'w+') + +test_text_file = os.path.join(args.out_dir, 'test', 'text') +test_text_fh = open(test_text_file, 'w+') +test_utt2spk_file = os.path.join(args.out_dir, 'test', 'utt2spk') +test_utt2spk_fh = open(test_utt2spk_file, 'w+') +test_image_file = os.path.join(args.out_dir, 'test', 'images.scp') +test_image_fh = open(test_image_file, 'w+') + +random.seed(0) +page_count = 0 +for page in sorted(os.listdir(args.database_path)): + page_path = os.path.join(args.database_path, page) + page_count = page_count + 1 + for line in sorted(os.listdir(page_path)): + if line.endswith('.txt'): + text_path = os.path.join(args.database_path, page, line) + image_name = line.split('.')[0] + image_path = os.path.join(args.database_path, page, image_name + '.png') + utt_id = page + '_' + image_name + gt_fh = open(text_path, 'r') + text = gt_fh.readlines()[0].strip() + + # The UW3 dataset doesn't have established training and testing splits + # The dataset is randomly split train 95% and test 5% + coin = random.randint(0, 20) + if coin >= 1: + train_text_fh.write(utt_id + ' ' + 
text + '\n') + train_utt2spk_fh.write(utt_id + ' ' + str(page_count) + '\n') + train_image_fh.write(utt_id + ' ' + image_path + '\n') + elif coin < 1: + test_text_fh.write(utt_id + ' ' + text + '\n') + test_utt2spk_fh.write(utt_id + ' ' + str(page_count) + '\n') + test_image_fh.write(utt_id + ' ' + image_path + '\n') diff --git a/egs/uw3/v1/local/score.sh b/egs/uw3/v1/local/score.sh new file mode 100755 index 00000000000..9ea4701a833 --- /dev/null +++ b/egs/uw3/v1/local/score.sh @@ -0,0 +1,156 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Apache 2.0 + +# This script is like steps/scoring/score_kaldi_wer.sh except it transcribes the 's +# using local/unk_arc_post_to_transcription.py and also it calls +# steps/scoring/score_kaldi_cer.sh at the end. + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=false +stats=true +beam=6 +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=3 +max_lmwt=13 +iter=final +#end configuration section. + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 +model_path=`echo $dir |xargs dirname` +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! 
-f $f ] && echo "score.sh: no such file $f" && exit 1; +done + + +ref_filtering_cmd="cat" +[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter" +[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter" +hyp_filtering_cmd="cat" +[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" +[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" + + +if $decode_mbr ; then + echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty" +else + echo "$0: scoring with word insertion penalty=$word_ins_penalty" +fi + + +mkdir -p $dir/scoring_kaldi +cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; +if [ $stage -le 0 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + mkdir -p $dir/scoring_kaldi/penalty_$wip/log + + if $decode_mbr ; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + acwt=\`perl -e \"print 1.0/LMWT\"\`\; \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-prune --beam=$beam ark:- ark:- \| \ + lattice-mbr-decode --word-symbol-table=$symtab \ + ark:- ark,t:- \| \ + utils/int2sym.pl -f 2- $symtab \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + + else + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-1best ark:- ark:- \| \ + lattice-align-words $lang_or_graph/phones/word_boundary.int $model_path/final.mdl ark:- ark:- \| \ + lattice-arc-post $model_path/final.mdl ark:- - \| \ + local/unk_arc_post_to_transcription.py $lang_or_graph/phones.txt $lang_or_graph/words.txt data/lang_unk/oov.int \| \ + $hyp_filtering_cmd '>' 
$dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + fi + + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \ + cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \ + tr '[:upper:]' '[:lower:]' \| \ + compute-wer --text --mode=present \ + "ark:cat $dir/scoring_kaldi/test_filt.txt| tr '[:upper:]' '[:lower:]' |" ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; + + done +fi + + + +if [ $stage -le 1 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for lmwt in $(seq $min_lmwt $max_lmwt); do + # adding /dev/null to the command list below forces grep to output the filename + grep WER $dir/wer_${lmwt}_${wip} /dev/null + done + done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1 + + best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer) + best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}') + best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}') + + if [ -z "$best_lmwt" ]; then + echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong." 
+ exit 1; + fi + + if $stats; then + mkdir -p $dir/scoring_kaldi/wer_details + echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight + echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty + + $cmd $dir/scoring_kaldi/log/stats1.log \ + cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\ + utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1; + + $cmd $dir/scoring_kaldi/log/stats2.log \ + cat $dir/scoring_kaldi/wer_details/per_utt \| \ + utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ + sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; + + $cmd $dir/scoring_kaldi/log/wer_bootci.log \ + compute-wer-bootci --mode=present \ + ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; + + fi +fi + +steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 $data $lang_or_graph $dir + +# If we got here, the scoring was successful. 
+# As a small aid to prevent confusion, we remove all wer_{?,??} files; +# these originate from the previous version of the scoring files +# I keep both statements here because removing only one could lead to confusion about +# the capabilities of the script (we don't do cer in the script) +rm $dir/wer_{?,??} 2>/dev/null +rm $dir/cer_{?,??} 2>/dev/null + +exit 0; diff --git a/egs/uw3/v1/local/train_lm.sh b/egs/uw3/v1/local/train_lm.sh new file mode 100755 index 00000000000..39eb051d273 --- /dev/null +++ b/egs/uw3/v1/local/train_lm.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 +# +# +# This script trains an LM on the UW3 training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data + +. ./path.sh +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +num_dev_sentences=4500 +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. 
+#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure that make_lm_dir.py is called and train only the 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + head -n $num_dev_sentences < data/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + tail -n +$[$num_dev_sentences+1] < data/train/text | cut -d " " -f 2- > ${dir}/data/text/uw3.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (a subset of the training data is used as ${dir}/data/text/uw3.txt to work + # out interpolation weights. + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get wordlist + cat ${dir}/data/text/uw3.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=3 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. 
+ # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=2 uw3=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + --fold-dev-into=uw3 ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + + # No need for pruning as the training data is quite small (total # of + # n-grams is 685k). Write the arpa: + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi diff --git a/egs/uw3/v1/local/unk_arc_post_to_transcription.py b/egs/uw3/v1/local/unk_arc_post_to_transcription.py new file mode 100755 index 00000000000..c86d35e4b8a --- /dev/null +++ b/egs/uw3/v1/local/unk_arc_post_to_transcription.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python + +# Copyright 2017 Ashish Arora + +import argparse +import sys + +parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""") +parser.add_argument('phones', type=str, help='phones and phonesID') +parser.add_argument('words', type=str, help='word and wordID') +parser.add_argument('unk', type=str, default='-', help='location of unk file') +parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data') +parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data') +args = parser.parse_args() +### main ### +phone_fh = open(args.phones, 'r') +word_fh = open(args.words, 'r') +unk_fh = open(args.unk,'r') +if args.input_ark == '-': 
+ input_fh = sys.stdin +else: + input_fh = open(args.input_ark,'r') +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'wb') + +phone_dict = dict()# stores phoneID and phone mapping +phone_data_vect = phone_fh.read().strip().split("\n") +for key_val in phone_data_vect: + key_val = key_val.split(" ") + phone_dict[key_val[1]] = key_val[0] +word_dict = dict() +word_data_vect = word_fh.read().strip().split("\n") +for key_val in word_data_vect: + key_val = key_val.split(" ") + word_dict[key_val[1]] = key_val[0] +unk_val = unk_fh.read().strip().split(" ")[0] + +utt_word_dict = dict() +utt_phone_dict = dict()# stores utteranceID and phoneID +unk_word_dict = dict() +count=0 +for line in input_fh: + line_vect = line.strip().split("\t") + if len(line_vect) < 6: + print "IndexError" + print line_vect + continue + uttID = line_vect[0] + word = line_vect[4] + phones = line_vect[5] + if uttID in utt_word_dict.keys(): + utt_word_dict[uttID][count] = word + utt_phone_dict[uttID][count] = phones + else: + count = 0 + utt_word_dict[uttID] = dict() + utt_phone_dict[uttID] = dict() + utt_word_dict[uttID][count] = word + utt_phone_dict[uttID][count] = phones + if word == unk_val: # get character sequence for unk + phone_key_vect = phones.split(" ") + phone_val_vect = list() + for pkey in phone_key_vect: + phone_val_vect.append(phone_dict[pkey]) + phone_2_word = list() + for phone_val in phone_val_vect: + phone_2_word.append(phone_val.split('_')[0]) + phone_2_word = ''.join(phone_2_word) + utt_word_dict[uttID][count] = phone_2_word + else: + if word == '0': + word_val = ' ' + else: + word_val = word_dict[word] + utt_word_dict[uttID][count] = word_val + count += 1 + +transcription = "" +for key in sorted(utt_word_dict.iterkeys()): + transcription = key + for index in sorted(utt_word_dict[key].iterkeys()): + value = utt_word_dict[key][index] + transcription = transcription + " " + value + out_fh.write(transcription + '\n') diff --git a/egs/uw3/v1/path.sh 
b/egs/uw3/v1/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/uw3/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/uw3/v1/run.sh b/egs/uw3/v1/run.sh new file mode 100755 index 00000000000..68c51fa4690 --- /dev/null +++ b/egs/uw3/v1/run.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian + + +set -e +stage=0 +nj=30 + +# This is the database path on the JHU grid. You may set this +# to data/download, in which case the script will automatically download +# the database: +uw3_database=/export/a10/corpora5/handwriting_ocr/UW3/ + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. utils/parse_options.sh # e.g. this parses the --stage option if supplied. + + +if [ $stage -le 0 ]; then + # Data preparation + local/prepare_data.sh --download-dir "$uw3_database" +fi + +mkdir -p data/{train,test}/data +if [ $stage -le 1 ]; then + echo "$0: Preparing feature files for the test and training data..." + for f in train test; do + local/make_features.py --feat-dim 40 --pad true data/$f | \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:data/$f/data/images.ark,data/$f/feats.scp || exit 1 + + steps/compute_cmvn_stats.sh data/$f || exit 1; + done +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing dictionary and lang..." 
+ local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \ + data/local/dict "" data/lang/temp data/lang +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + data/local/dict/lexicon.txt data/lang_test + + echo "$0: Preparing the unk model for open-vocab decoding..." + utils/lang/make_unk_lm.sh --ngram-order 4 --num-extra-ngrams 7500 \ + data/local/dict exp/unk_lang_model + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \ + --unk-fst exp/unk_lang_model/unk_fst.txt \ + data/local/dict "" data/lang_unk/temp data/lang_unk + cp data/lang_test/G.fst data/lang_unk/G.fst +fi + +if [ $stage -le 4 ]; then + steps/train_mono.sh --nj $nj --cmd $cmd \ + data/train data/lang exp/mono +fi + +if [ $stage -le 5 ]; then + steps/align_si.sh --nj $nj --cmd $cmd \ + data/train data/lang exp/mono exp/mono_ali + steps/train_deltas.sh --cmd $cmd 500 20000 \ + data/train data/lang exp/mono_ali exp/tri +fi + +if [ $stage -le 6 ]; then + steps/align_si.sh --nj $nj --cmd $cmd \ + data/train data/lang exp/tri exp/tri_ali + steps/train_lda_mllt.sh --cmd $cmd --splice-opts "--left-context=3 --right-context=3" 500 20000 \ + data/train data/lang exp/tri_ali exp/tri2 +fi + +if [ $stage -le 7 ]; then + utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph + steps/decode.sh --nj $nj --cmd $cmd \ + exp/mono/graph data/test exp/mono/decode_test +fi + +if [ $stage -le 8 ]; then + utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph + steps/decode.sh --nj $nj --cmd $cmd \ + exp/tri/graph data/test exp/tri/decode_test +fi + +if [ $stage -le 9 ]; then + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph + steps/decode.sh --nj $nj --cmd $cmd \ + exp/tri2/graph data/test exp/tri2/decode_test +fi + +if [ $stage -le 10 ]; then + steps/align_si.sh --nj $nj --cmd $cmd --use-graphs true \ + data/train 
data/lang exp/tri2 exp/tri2_ali +fi + +if [ $stage -le 11 ]; then + local/chain/run_cnn_1a.sh +fi diff --git a/egs/uw3/v1/steps b/egs/uw3/v1/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/uw3/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/uw3/v1/utils b/egs/uw3/v1/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/uw3/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file From 8f79b01faf8a392d170ff11ddad0733d6cb38c0d Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Thu, 4 Jan 2018 22:54:45 -0800 Subject: [PATCH 054/184] [src,scripts] Remove BatchNormComponent 'power' option --- .../steps/libs/nnet3/xconfig/basic_layers.py | 7 +-- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 7 +-- src/nnet3/nnet-normalize-component.cc | 59 ++++++------------- src/nnet3/nnet-normalize-component.h | 10 +--- src/nnet3/nnet-test-utils.cc | 1 - 5 files changed, 24 insertions(+), 60 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index c59e4a6041e..0e27e6369e4 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -674,7 +674,6 @@ def set_default_configs(self): 'bottleneck-dim': -1, 'self-repair-scale': 1.0e-05, 'target-rms': 1.0, - 'batchnorm-power': -0.5, 'ng-affine-options': '', 'ng-linear-options': '', # only affects bottleneck layers. 
'dropout-proportion': 0.5, # dropout-proportion only @@ -754,7 +753,6 @@ def _add_components(self, input_desc, input_dim, nonlinearities): output_dim = self.output_dim() self_repair_scale = self.config['self-repair-scale'] target_rms = self.config['target-rms'] - batchnorm_power = self.config['batchnorm-power'] affine_options = self.config['ng-affine-options'] for opt_name in [ 'max-change', 'learning-rate-factor', @@ -845,10 +843,9 @@ def _add_components(self, input_desc, input_dim, nonlinearities): elif nonlinearity == 'batchnorm': line = ('component name={0}.{1}' - ' type=BatchNormComponent dim={2}' - ' target-rms={3} power={4}' + ' type=BatchNormComponent dim={2} target-rms={3}' ''.format(self.name, nonlinearity, output_dim, - target_rms, batchnorm_power)) + target_rms)) elif nonlinearity == 'so': line = ('component name={0}.{1}' diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 85454795435..67537f574e4 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -829,9 +829,6 @@ def set_default_configs(self): 'clipping-threshold': 30.0, 'zeroing-interval': 20, 'zeroing-threshold': 15.0, - # batchnorm-power is for what i'm going to call OverNorm, you can set it - # for example to -0.75. 
- 'batchnorm-power': -0.5, 'delay' : -1, 'lstm-nonlinearity-options' : ' max-change=0.75', # the recurrence scale is the scale on m_trunc, used in the @@ -945,8 +942,8 @@ def _generate_lstm_config(self): configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format( name, 2 * cell_dim, bptrunc_str)) - configs.append("component name={0}.m_batchnorm type=BatchNormComponent power={1} dim={2} ".format( - name, self.config['batchnorm-power'], cell_dim)) + configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( + name, cell_dim)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index e6be8210bb0..507a30d1aa1 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -234,8 +234,9 @@ void BatchNormComponent::ComputeDerived() { // of numerical roundoff. scale_.ApplyFloor(0.0); scale_.Add(epsilon_); - scale_.ApplyPow(power_); - // now scale_ = min(variance, epsilon)^power_ + BaseFloat power = -0.5; + scale_.ApplyPow(power); + // now scale_ = min(variance, epsilon)^power // next, multiply by the target RMS (normally 1.0). 
scale_.Scale(target_rms_); offset_.MulElements(scale_); @@ -253,7 +254,7 @@ void BatchNormComponent::Check() const { } BatchNormComponent::BatchNormComponent(const BatchNormComponent &other): - dim_(other.dim_), block_dim_(other.block_dim_), power_(other.power_), + dim_(other.dim_), block_dim_(other.block_dim_), epsilon_(other.epsilon_), target_rms_(other.target_rms_), test_mode_(other.test_mode_), count_(other.count_), stats_sum_(other.stats_sum_), stats_sumsq_(other.stats_sumsq_) { @@ -267,7 +268,6 @@ std::string BatchNormComponent::Info() const { stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ << ", count=" << count_ - << ", power=" << power_ << ", test-mode=" << (test_mode_ ? "true" : "false"); if (count_ > 0) { Vector mean(stats_sum_), var(stats_sumsq_); @@ -286,14 +286,12 @@ std::string BatchNormComponent::Info() const { void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { dim_ = -1; block_dim_ = -1; - power_ = -0.5; epsilon_ = 1.0e-03; target_rms_ = 1.0; test_mode_ = false; bool ok = cfl->GetValue("dim", &dim_); cfl->GetValue("block-dim", &block_dim_); cfl->GetValue("epsilon", &epsilon_); - cfl->GetValue("power", &power_); cfl->GetValue("target-rms", &target_rms_); cfl->GetValue("test-mode", &test_mode_); if (!ok || dim_ <= 0) { @@ -307,8 +305,6 @@ void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); - if (power_ >= 0 || power_ <= -1.0) - KALDI_ERR << "Power has invalid value " << power_; count_ = 0; stats_sum_.Resize(block_dim_); stats_sumsq_.Resize(block_dim_); @@ -393,13 +389,12 @@ void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { BACKWARD PASS (recap): var_deriv_mod = 2 * power * target-rms^{1/power} * (1/I \sum_i z'(i) z(i)) * scale^{-(1+power)/power} + .. 
which for power = -0.5, simplifies to: + var_deriv_mod = -1.0 / (target-rms^2) * (1/I \sum_i z'(i) z(i)) * scale x'(i) = scale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod */ - - - void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { @@ -435,15 +430,16 @@ void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, mean.AddRowSumMat(1.0 / num_frames, in, 0.0); uvar.AddDiagMat2(1.0 / num_frames, in, kTrans, 0.0); scale.CopyFromVec(uvar); + // by applying this scale at this point, we save a multiply later on. - BaseFloat var_scale = std::pow(target_rms_, 1.0 / power_); + BaseFloat var_scale = 1.0 / (target_rms_ * target_rms_); scale.AddVecVec(-var_scale, mean, mean, var_scale); - // at this point, 'scale' contains just the variance (times target-rms^{-power}) + // at this point, 'scale' contains just the variance (times target-rms^{-2}). scale.ApplyFloor(0.0); scale.Add(var_scale * epsilon_); // Now 'scale' contains the variance floored to zero and then with epsilon - // added [both times target-rms^{-power}] - scale.ApplyPow(power_); + // added [both times 1/target-rms^2]. + scale.ApplyPow(-0.5); // now 'scale' is the actual scale we'll use. // the next command will do no work if out == in, for in-place propagation. @@ -509,18 +505,19 @@ void BatchNormComponent::Backprop( KALDI_ASSERT(out_value.NumRows() == num_frames); CuSubVector scale(memo->mean_uvar_scale, 2), - temp(memo->mean_uvar_scale, 4), var_deriv_mod(memo->mean_uvar_scale, 3), - scale_pow(memo->mean_uvar_scale, 4); + temp(memo->mean_uvar_scale, 4); // var_deriv_mod is going to contain: // 2 * power * target-rms^{1/power} * (1/I \sum_i z'(i) z(i)) * scale^{-(1+power)/power} + // which for power = -0.5 simplifies to: + // -1.0 / (target_rms * target_rms). // but for now we don't have the power of 'scale', we'll add that later. 
- BaseFloat coeff = 2.0 * power_ * std::pow(target_rms_, 1.0 / power_) / - num_frames; + BaseFloat coeff = -1.0 / (target_rms_ * target_rms_ * num_frames); + var_deriv_mod.AddDiagMatMat(coeff, out_value, kTrans, out_deriv, kNoTrans, 0.0); - + var_deriv_mod.MulElements(scale); temp.AddRowSumMat(-1.0 / num_frames, out_deriv, 0.0); // the following statement does no work if in_deriv and out_deriv are the @@ -533,19 +530,9 @@ void BatchNormComponent::Backprop( // At this point, *in_deriv contains // scale * (z'(i) - 1/I * \sum_i z'(i)) - // The next few lines complete the calculation of 'var_deriv_mod'; - // we delayed it because we were using 'temp', and 'scale_pow' - // uses the same memory. - if (power_ == -0.5) { - // we can simplify scale^{-(1+power)/power} to just 'scale'. - var_deriv_mod.MulElements(scale); - } else { - scale_pow.CopyFromVec(scale); - scale_pow.ApplyPow(-1.0 * (1.0 + power_) / power_); - var_deriv_mod.MulElements(scale_pow); - } in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans, var_deriv_mod, 1.0); + // At this point, *in_deriv contains what we described in the comment // starting BATCHNORM_MATH as: // x'(i) = scale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod @@ -602,12 +589,6 @@ void BatchNormComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &block_dim_); - if (PeekToken(is, binary) == 'P') { - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &power_); - } else { - power_ = -0.5; - } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &epsilon_); ExpectToken(is, binary, ""); @@ -635,10 +616,6 @@ void BatchNormComponent::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, block_dim_); - if (power_ != -0.5) { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, power_); - } WriteToken(os, binary, ""); WriteBasicType(os, binary, epsilon_); WriteToken(os, 
binary, ""); diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index b10c3e4a60c..1806fe38493 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -232,8 +232,8 @@ class BatchNormComponent: public Component { // 'sum_sumsq_scale' is of dimension 5 by block_dim_: // Row 0 = mean = the mean of the rows of the input // Row 1 = uvar = the uncentered variance of the input (= sumsq / num_frames). - // Row 2 = scale = the scale of the renormalization, which is - // Rows 3 and 4 are used as a temporaries in Backprop. + // Row 2 = scale = the scale of the renormalization. + // Rows 3 and 4 are used as temporaries in Backprop. CuMatrix mean_uvar_scale; }; @@ -260,12 +260,6 @@ class BatchNormComponent: public Component { // always will in the new code in nnet-convolutional-component.h. int32 block_dim_; - - // This power determines the scale as a power of the variance... the default - // (-0.5) corresponds to regular BatchNorm, but you can set it to other - // values, like -0.25 or -0.4, for what we'll call "fractional BatchNorm" - BaseFloat power_; - // Used to avoid exact-zero variances, epsilon has the dimension of a // covariance. BaseFloat epsilon_; diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 472a02197e5..48a97df9ea1 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1681,7 +1681,6 @@ static void GenerateRandomComponentConfig(std::string *component_type, << " block-dim=" << block_dim << " target-rms=" << RandInt(1, 4) << " test-mode=" << (test_mode ? "true" : "false") - << " power=" << (-0.1 * RandInt(3, 5)) << " epsilon=" << (RandInt(0, 1) == 0 ? 
"0.1" : "1.0"); break; } From d5660c54d8da285dfadf499249cf310e42cda841 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 5 Jan 2018 21:20:59 -0500 Subject: [PATCH 055/184] [egs] Add newly tuned mini-librispeech example with factored output layer --- .../s5/local/chain/run_tdnn.sh | 2 +- .../s5/local/chain/tuning/run_tdnn_1f.sh | 308 ++++++++++++++++++ 2 files changed, 309 insertions(+), 1 deletion(-) create mode 100755 egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh diff --git a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh index 75da1a0a553..cb5756188a4 120000 --- a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh +++ b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1e.sh \ No newline at end of file +tuning/run_tdnn_1f.sh \ No newline at end of file diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh new file mode 100755 index 00000000000..67b8b56ea49 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh @@ -0,0 +1,308 @@ +#!/bin/bash + +# 1f is as 1e but a smaller model with various tuning changes, the most +# important of which is the 'bottleneck-dim' option for the last layer; +# also dimensions are reduced and we've removed the 'target-rms=0.5' options +# on the prefinal layers. 
+# +# local/chain/compare_wer.sh exp/chain/tdnn1e_sp exp/chain/tdnn1f_sp +# System tdnn1e_sp tdnn1f_sp +#WER dev_clean_2 (tgsmall) 13.84 13.92 +# [online:] 13.82 14.01 +#WER dev_clean_2 (tglarge) 10.17 9.83 +# [online:] 10.25 9.96 +# Final train prob -0.0500 -0.0515 +# Final valid prob -0.0870 -0.0889 +# Final train prob (xent) -1.4168 -1.3739 +# Final valid prob (xent) -1.6861 -1.6125 +# Num-params 7553634 3976418 + +# steps/info/chain_dir_info.pl exp/chain/tdnn1{e,f}_sp +# exp/chain/tdnn1e_sp: num-iters=17 nj=2..5 num-params=7.6M dim=40+100->2353 combine=-0.055->-0.055 (over 1) xent:train/valid[10,16,final]=(-1.73,-1.47,-1.42/-1.95,-1.73,-1.69) logprob:train/valid[10,16,final]=(-0.066,-0.054,-0.050/-0.100,-0.091,-0.087) +# exp/chain/tdnn1f_sp: num-iters=17 nj=2..5 num-params=4.0M dim=40+100->2353 combine=-0.060->-0.059 (over 2) xent:train/valid[10,16,final]=(-1.64,-1.43,-1.37/-1.85,-1.66,-1.61) logprob:train/valid[10,16,final]=(-0.069,-0.057,-0.052/-0.104,-0.094,-0.089) + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1f # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. 
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05" + output_opts="l2-regularize=0.02 bottleneck-dim=192" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=384 + 
relu-batchnorm-layer name=tdnn2 $opts dim=384 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=384 + relu-batchnorm-layer name=tdnn4 $opts dim=384 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=384 + relu-batchnorm-layer name=tdnn6 $opts dim=384 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=384 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=384 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=384 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=384 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l Date: Sat, 6 Jan 2018 18:55:23 -0500 Subject: [PATCH 056/184] [egs] Improvement to mini-librispeech 1f example --- .../s5/local/chain/tuning/run_tdnn_1f.sh | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh index 67b8b56ea49..58852b61aa8 100755 --- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh @@ -1,25 +1,28 @@ #!/bin/bash + # 1f is as 1e but a smaller model with various tuning changes, the most # important of which is the 'bottleneck-dim' option for the last layer; # also dimensions are reduced and we've removed the 'target-rms=0.5' options # on the prefinal layers. 
# -# local/chain/compare_wer.sh exp/chain/tdnn1e_sp exp/chain/tdnn1f_sp -# System tdnn1e_sp tdnn1f_sp -#WER dev_clean_2 (tgsmall) 13.84 13.92 -# [online:] 13.82 14.01 -#WER dev_clean_2 (tglarge) 10.17 9.83 -# [online:] 10.25 9.96 -# Final train prob -0.0500 -0.0515 -# Final valid prob -0.0870 -0.0889 -# Final train prob (xent) -1.4168 -1.3739 -# Final valid prob (xent) -1.6861 -1.6125 -# Num-params 7553634 3976418 - -# steps/info/chain_dir_info.pl exp/chain/tdnn1{e,f}_sp -# exp/chain/tdnn1e_sp: num-iters=17 nj=2..5 num-params=7.6M dim=40+100->2353 combine=-0.055->-0.055 (over 1) xent:train/valid[10,16,final]=(-1.73,-1.47,-1.42/-1.95,-1.73,-1.69) logprob:train/valid[10,16,final]=(-0.066,-0.054,-0.050/-0.100,-0.091,-0.087) -# exp/chain/tdnn1f_sp: num-iters=17 nj=2..5 num-params=4.0M dim=40+100->2353 combine=-0.060->-0.059 (over 2) xent:train/valid[10,16,final]=(-1.64,-1.43,-1.37/-1.85,-1.66,-1.61) logprob:train/valid[10,16,final]=(-0.069,-0.057,-0.052/-0.104,-0.094,-0.089) +# local/chain/compare_wer.sh --online exp/chain/tdnn1{e,f}_sp 2>/dev/null +# local/chain/compare_wer.sh --online exp/chain/tdnn1e_sp exp/chain/tdnn1f_sp +# System tdnn1e_sp tdnn1f7_sp +#WER dev_clean_2 (tgsmall) 14.11 13.91 +# [online:] 14.07 13.96 +#WER dev_clean_2 (tglarge) 10.15 9.95 +# [online:] 10.16 10.13 +# Final train prob -0.0503 -0.0508 +# Final valid prob -0.0887 -0.0917 +# Final train prob (xent) -1.4257 -1.3509 +# Final valid prob (xent) -1.6799 -1.5883 +# Num-params 7508490 4205322 + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1{e,f7}_sp +# exp/chain/tdnn1e_sp: num-iters=17 nj=2..5 num-params=7.5M dim=40+100->2309 combine=-0.057->-0.057 (over 1) xent:train/valid[10,16,final]=(-1.73,-1.46,-1.43/-1.94,-1.72,-1.68) logprob:train/valid[10,16,final]=(-0.067,-0.055,-0.050/-0.105,-0.095,-0.089) +# exp/chain/tdnn1f_sp: num-iters=17 nj=2..5 num-params=4.2M dim=40+100->2309 combine=-0.060->-0.060 (over 2) xent:train/valid[10,16,final]=(-1.60,-1.39,-1.35/-1.81,-1.64,-1.59) 
logprob:train/valid[10,16,final]=(-0.068,-0.056,-0.051/-0.104,-0.097,-0.092) # Set -e here so that we catch if any executable fails immediately @@ -175,7 +178,7 @@ if [ $stage -le 13 ]; then relu-batchnorm-layer name=tdnn5 $opts dim=384 relu-batchnorm-layer name=tdnn6 $opts dim=384 input=Append(-3,0,3) relu-batchnorm-layer name=tdnn7 $opts dim=384 input=Append(-3,0,3) - relu-batchnorm-layer name=tdnn8 $opts dim=384 input=Append(-6,-3,0) + relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain $opts dim=384 From 3fecceae3655e0f23a5c0a0b5d86e4b42057bbd9 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 8 Jan 2018 16:32:20 -0500 Subject: [PATCH 057/184] [doc] Documentation fix (thx: Denis Peskov) --- src/doc/lattices.dox | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/doc/lattices.dox b/src/doc/lattices.dox index 714d9de6f2e..0b222ec5f1a 100644 --- a/src/doc/lattices.dox +++ b/src/doc/lattices.dox @@ -264,8 +264,10 @@ has the same effect as calling that the normal OpenFst RemoveEps() and Determini \section lattices_generation Lattice generation -Currently, the only decoder that generates lattices is the class -LatticeSimpleDecoder, defined in decoder/lattice-simple-decoder.h, and invoked by gmm-latgen-simple.cc. +Command-line decoding programs that have 'latgen' in their names generate lattices. +Currently most of these use LatticeFasterDecoder. For purposes of exposition we will +focus instead on LatticeSimpleDecoder, whose operation is simpler. +This is defined in decoder/lattice-simple-decoder.h, and invoked by gmm-latgen-simple.cc. As the name suggests, LatticeSimpleDecoder is a lattice-generating decoder that is modified from SimpleDecoder. SimpleDecoder is a straightforwardly implemented Viterbi beam search algorithm with only a single tunable parameter: the pruning beam (see \ref decoders_simple). 
LatticeSimpleDecoder has From 6e6e6709217ebd9da297eb5dbc8366bae3571459 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 8 Jan 2018 20:43:28 -0500 Subject: [PATCH 058/184] [src] Fix to nnet-utils RE orthonormal-constraint --- egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh | 340 +++++++++++++++++++ src/nnet3/nnet-utils.cc | 10 +- 2 files changed, 347 insertions(+), 3 deletions(-) create mode 100755 egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh new file mode 100755 index 00000000000..2660adb85d7 --- /dev/null +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +# 1f is as 1e but a re-tuned model with fewer parameters and a bottleneck at the +# end. + +# local/chain/compare_wer.sh exp/chain/tdnn1e10_sp exp/chain/tdnn1f_sp +# System tdnn1e10_sp tdnn1f_sp +#WER dev93 (tgpr) 7.29 7.20 +#WER dev93 (tg) 7.08 7.00 +#WER dev93 (big-dict,tgpr) 5.15 5.08 +#WER dev93 (big-dict,fg) 4.52 4.65 +#WER eval92 (tgpr) 5.12 4.93 +#WER eval92 (tg) 4.91 4.66 +#WER eval92 (big-dict,tgpr) 2.94 2.87 +#WER eval92 (big-dict,fg) 2.57 2.39 +# Final train prob -0.0545 -0.0512 +# Final valid prob -0.0650 -0.0641 +# Final train prob (xent) -0.9696 -0.9105 +# Final valid prob (xent) -0.9917 -0.9523 +# Num-params 8067660 6071244 + +# exp/chain/tdnn1e_sp: num-iters=72 nj=2..8 num-params=8.1M dim=40+100->2854 combine=-0.064->-0.063 (over 3) xent:train/valid[47,71,final]=(-1.07,-0.973,-0.970/-1.08,-0.992,-0.992) logprob:train/valid[47,71,final]=(-0.064,-0.056,-0.054/-0.072,-0.066,-0.065) +# exp/chain/tdnn1f_sp: num-iters=72 nj=2..8 num-params=6.1M dim=40+100->2854 combine=-0.061->-0.061 (over 2) xent:train/valid[47,71,final]=(-1.04,-0.911,-0.910/-1.06,-0.953,-0.952) logprob:train/valid[47,71,final]=(-0.063,-0.052,-0.051/-0.071,-0.064,-0.064) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used 
in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1f #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005 bottleneck-dim=320" + + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=448 + relu-batchnorm-layer name=tdnn2 $opts dim=448 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=448 + relu-batchnorm-layer name=tdnn4 $opts dim=448 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=448 + relu-batchnorm-layer name=tdnn6 $opts dim=448 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=448 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=448 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=448 + output-layer name=output $output_opts include-log-softmax=false dim=$num_targets + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent $opts input=tdnn8 dim=448 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.0005 \ + --trainer.optimization.final-effective-lrate=0.00005 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + 
--egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l *M) { // Larger alpha will update faster but will be more prone to instability. I // believe the scalar value below shouldn't be more than 0.25 or maybe 0.5 or // it will always be unstable. It should be > 0.0. - // The factor of 1/scale^4 is, I *believe*, going to give us the right - // kind of invariance w.r.t. the scale. - BaseFloat alpha = 0.125 / (scale * scale * scale * scale); + // The factor of 1/scale^2 is, I *believe*, going to give us the right + // kind of invariance w.r.t. the scale. 
With regard to this factor, look at + // the statement + // M_update.AddMatMat(-4.0 * alpha, P, kNoTrans, *M, kNoTrans, 0.0); where P + // is proportional to scale^2 and M to 'scale', so the RHS is proportional to + // 'scale^3', but we'd like 'M_update' to be proportional to 'scale'. + BaseFloat alpha = 0.125 / (scale * scale); // We're enforcing the rows to be orthonormal. // define P = M M^T. If P is unit then M has orthonormal rows. From 1b9b9e7b6999a57fbf803e03fed3152ed81823f6 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 12 Jan 2018 00:48:49 -0500 Subject: [PATCH 059/184] [scripts] Add orthonormal-constraint options to layers. --- egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py | 10 +++++++--- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 8 +++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 0e27e6369e4..c2962de96a7 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -471,6 +471,8 @@ def set_default_configs(self): self.config = {'input': '[-1]', 'dim': -1, 'bottleneck-dim': -1, + 'orthonormal-constraint': 1.0, + # orthonormal-constraint only matters if bottleneck-dim is set. 'include-log-softmax': True, # this would be false for chain models 'objective-type': 'linear', @@ -582,9 +584,11 @@ def _generate_config(self): # note: by default the LinearComponent uses natural gradient. 
line = ('component name={0}.linear type=LinearComponent ' - 'orthonormal-constraint=1.0 input-dim={1} output-dim={2} ' - 'max-change=0.75 {3}' - ''.format(self.name, input_dim, bottleneck_dim, linear_options)) + 'orthonormal-constraint={1} param-stddev={2} ' + 'input-dim={3} output-dim={4} max-change=0.75 {5}' + ''.format(self.name, self.config['orthonormal-constraint'], + self.config['orthonormal-constraint'] / math.sqrt(input_dim), + input_dim, bottleneck_dim, linear_options)) configs.append(line) line = ('component-node name={0}.linear component={0}.linear input={1}' ''.format(self.name, cur_node)) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 67537f574e4..131acc254dd 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -829,6 +829,7 @@ def set_default_configs(self): 'clipping-threshold': 30.0, 'zeroing-interval': 20, 'zeroing-threshold': 15.0, + 'orthonormal-constraint': 1.0, 'delay' : -1, 'lstm-nonlinearity-options' : ' max-change=0.75', # the recurrence scale is the scale on m_trunc, used in the @@ -921,9 +922,10 @@ def _generate_lstm_config(self): # constraint, it's meaningless. 
configs.append("### Begin LTSM layer '{0}'".format(name)) configs.append("component name={0}.W_all_a type=LinearComponent input-dim={1} " - "orthonormal-constraint=1.0 output-dim={2} {3}".format( - name, input_dim + cell_dim, bottleneck_dim, - affine_str)) + "orthonormal-constraint={2} output-dim={3} {4}".format( + name, input_dim + cell_dim, + self.config['orthonormal-constraint'], + bottleneck_dim, affine_str)) configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, From 7963c45c72f63c209aaab455e5b8074a64e55c12 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 12 Jan 2018 22:07:12 -0500 Subject: [PATCH 060/184] [src] Bug-fix in nnet3 compilation, RE Scale() expressions --- src/nnet3/nnet-compile.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/nnet3/nnet-compile.cc b/src/nnet3/nnet-compile.cc index 93f35dc8615..9a4559803ad 100644 --- a/src/nnet3/nnet-compile.cc +++ b/src/nnet3/nnet-compile.cc @@ -608,6 +608,7 @@ BaseFloat Compiler::SplitByScale( } } + int32 num_rows = input_locations_list.size(); split_locations_lists->resize(alpha_to_nodes.size()); // `step_to_index` will map from the step-index to the index into // `split_locations_lists`; each index is associated with a different value of @@ -622,6 +623,7 @@ BaseFloat Compiler::SplitByScale( BaseFloat alpha = iter->first; const std::vector &nodes = iter->second; (*split_locations_lists)[split_locations_index].first = alpha; + (*split_locations_lists)[split_locations_index].second.resize(num_rows); for (size_t i = 0; i < nodes.size(); i++) { int32 node_index = nodes[i]; KALDI_ASSERT(node_to_steps.count(node_index) != 0); @@ -638,7 +640,6 @@ BaseFloat Compiler::SplitByScale( { // This block populates 'split_locations_lists[*].second' with the // split-by-alpha version of 'input_locations_list' - int32 num_rows = input_locations_list.size(); for (int32 r = 0; r < num_rows; r++) { const 
std::vector > &this_list = input_locations_list[r]; @@ -856,7 +857,7 @@ void Compiler::CompileBackwardSumDescriptor( BaseFloat this_alpha = split_locations_lists[i].first; KALDI_ASSERT(this_alpha - this_alpha == 0.0); std::vector > > submat_locations_list; - ComputeValueSubmatLocationsList(split_locations_lists[i].second, + ComputeDerivSubmatLocationsList(split_locations_lists[i].second, &submat_locations_list); CompileBackwardFromSubmatLocationsList(deriv_submatrix_index, this_alpha, From cbaf7e6c5054d0a1f53d940ee84c684737fa8fe8 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 13 Jan 2018 01:11:42 -0500 Subject: [PATCH 061/184] [scripts] Add factorized layer --- .../libs/nnet3/xconfig/factorized_layer.py | 198 ++++++++++++++++++ egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py | 1 + egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 1 + 3 files changed, 200 insertions(+) create mode 100644 egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py new file mode 100644 index 00000000000..16ba460a04e --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py @@ -0,0 +1,198 @@ +# Copyright 2017-2018 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2017 Google Inc. (vpeddinti@google.com) +# 2017 Vimal Manohar +# Apache 2.0. + +""" This module contains layers that just map to a single component. +""" + +from __future__ import print_function +import math +import re +import sys +from libs.nnet3.xconfig.basic_layers import XconfigLayerBase + + +class XconfigFactorizedLayer(XconfigLayerBase): + """This class is for parsing lines like + 'factorized-layer name=tdnn1 dim=1024 bottleneck-dim=256 bypass-scale=1.0 splicing=-3,0,3' + + This is basically the same as a relu-batchnorm-layer with the bottleneck-dim + set, except that it supports the 'bypass-scale' option, which makes the + whole thing a bit like a res-block. 
You specify the splicing via the 'splicing' + option instead of via 'input=xxx', as it needs to use the non-spliced inupt for + the bypass. + + Note: the 'dim' is actually optional; it will default to the + dimension of the input, and it must be the same as the dimension of the input. + + + Parameters of the class, and their defaults: + input='[-1]' [Descriptor giving the input of the layer.] + splicing='0' [In general can be a comma-separated string describing + the TDNN time-offsets, like '-1,0,1' or '-3,0,3'. + Not specified via 'input', because we need the un-spliced + input so that we can do the] bypass. + dim=-1 [Output dimension of layer, e.g. 1024; must be set.] + bottleneck-dim=-1 [Bottleneck dimension, must be set; e.g. 256] + self-repair-scale=1.0e-05 [Affects the relu layer] + learning-rate-factor=1.0 [This can be used to make the affine component + train faster or slower]. + l2-regularize=0.0 [Set this to a nonzero value (e.g. 1.0e-05) to + add l2 regularization on the parameter norm for + this component. + + """ + def __init__(self, first_token, key_to_value, prev_names=None): + assert first_token == "factorized-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. + self.config = {'input': '[-1]', + 'dim': -1, + 'bottleneck-dim': -1, + 'self-repair-scale': 1.0e-05, + 'target-rms': 1.0, + 'splicing': '0', + 'bypass-scale': 1.0, + 'ng-affine-options': '', + 'ng-linear-options': '', + # The following are passed through to components. + 'bias-stddev': '', + 'l2-regularize': '', + 'learning-rate-factor': '', + 'max-change': 0.75 } + + def check_configs(self): + input_dim = self.descriptors['input']['dim'] + + if self.config['dim'] == -1: + self.config['dim'] = input_dim + elif self.config['dim'] != input_dim: + raise RuntimeError("Dimension mismatch: dim={0} vs. 
input-dim={1}".format( + self.config['dim'], input_dim)) + b = self.config['bottleneck-dim'] + if b <= 0 or b >= self.config['dim']: + raise RuntimeError("bottleneck-dim has an invalid value {0}".format(b)) + + if self.config['self-repair-scale'] < 0.0 or self.config['self-repair-scale'] > 1.0: + raise RuntimeError("self-repair-scale has invalid value {0}" + .format(self.config['self-repair-scale'])) + if self.config['target-rms'] < 0.0: + raise RuntimeError("target-rms has invalid value {0}" + .format(self.config['target-rms'])) + if self.config['learning-rate-factor'] <= 0.0: + raise RuntimeError("learning-rate-factor has invalid value {0}" + .format(self.config['learning-rate-factor'])) + + splicing = self.config['splicing'] + try: + splicing_array = [ int(x) for x in splicing.split(',') ] + if not 0 in splicing_array: + raise RuntimeError("0 should probably be in the splicing indexes.") + except: + raise RuntimeError("Invalid option splicing={0}".format(splicing)) + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + # return something like: tdnn3.batchnorm + return '{0}.batchnorm'.format(self.name) + + def output_dim(self, auxiliary_output=None): + output_dim = self.config['dim'] + # If not set, the output-dim defaults to the input-dim. + if output_dim <= 0: + self.config['dim'] = self.descriptors['input']['dim'] + return output_dim + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. 
+ ans.append((config_name, line)) + return ans + + def _generate_config(self): + input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + bottleneck_dim = self.config['bottleneck-dim'] + output_dim = input_dim + self_repair_scale = self.config['self-repair-scale'] + target_rms = self.config['target-rms'] + bypass_scale = self.config['bypass-scale'] + splicing_array = [ int(x) for x in self.config['splicing'].split(',') ] + spliced_input_desc = 'Append({0})'.format( + ', '.join([ 'Offset({0}, {1})'.format(input_desc, offset) + for offset in splicing_array ])) + # e.g. spliced_input_desc = + # 'Append(Offset(tdnn2, -1), Offset(tdnn2, 0), Offset(tdnn2, 1))' + + spliced_input_dim = input_dim * len(splicing_array) + + affine_options = self.config['ng-affine-options'] + for opt_name in [ 'max-change', 'learning-rate-factor', + 'bias-stddev', 'l2-regularize' ]: + value = self.config[opt_name] + if value != '': + affine_options += ' {0}={1}'.format(opt_name, value) + + linear_options = self.config['ng-linear-options'] + for opt_name in [ 'max-change', 'learning-rate-factor' ]: + value = self.config[opt_name] + if value != '': + linear_options += ' {0}={1}'.format(opt_name, value) + + configs = [] + + # First the linear component that goes to the bottleneck dim. + # note: by default the LinearComponent uses natural gradient. 
+ line = ('component name={0}.linear type=LinearComponent ' + 'orthonormal-constraint=1.0 input-dim={1} output-dim={2} {3}' + ''.format(self.name, spliced_input_dim, bottleneck_dim, + linear_options)) + configs.append(line) + line = ('component-node name={0}.linear component={0}.linear input={1}' + ''.format(self.name, spliced_input_desc)) + configs.append(line) + + # Now the affine component + line = ('component name={0}.affine type=NaturalGradientAffineComponent' + ' input-dim={1} output-dim={2} {3}' + ''.format(self.name, bottleneck_dim, output_dim, affine_options)) + configs.append(line) + line = ('component-node name={0}.affine component={0}.affine input={0}.linear' + ''.format(self.name)) + configs.append(line) + + # now the ReLU. Its input is the output of the affine component plus + # the non-sliced input (this is a bit like a res-block). + line = ('component name={0}.relu type=RectifiedLinearComponent dim={1}' + ' self-repair-scale={2}' + ''.format(self.name, output_dim, self_repair_scale)) + configs.append(line) + if bypass_scale != 0.0: + line = ('component-node name={0}.relu component={0}.relu ' + 'input=Sum(Scale({1}, {2}), {0}.affine) ' + ''.format(self.name, bypass_scale, input_desc)) + else: + line = ('component-node name={0}.relu component={0}.relu ' + 'input={0}.affine'.format(self.name)) + configs.append(line) + + line = ('component name={0}.batchnorm type=BatchNormComponent ' + 'dim={1} target-rms={2}' + ''.format(self.name, output_dim, target_rms)) + configs.append(line) + line = ('component-node name={0}.batchnorm component={0}.batchnorm ' + 'input={0}.relu'.format(self.name)) + configs.append(line) + + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py index 593fbbb745c..d609fb2e685 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py @@ -10,3 +10,4 @@ from gru import * from stats_layer import * from trivial_layers 
import * +from factorized_layer import * diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index c6b0619bca8..ec9137eadd6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -68,6 +68,7 @@ 'opgru-layer' : xlayers.XconfigOpgruLayer, 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, + 'factorized-layer': xlayers.XconfigFactorizedLayer, 'renorm-component': xlayers.XconfigRenormComponent, 'no-op-component': xlayers.XconfigNoOpComponent } From b4f0585800102fe6a0c3b8a767b004694b669417 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 14 Jan 2018 14:43:31 -0500 Subject: [PATCH 062/184] [src] Small bug-fix affecting info output of LinearComponent --- src/nnet3/nnet-simple-component.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index ea5a2489bc4..b4138cc989e 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -3050,7 +3050,7 @@ std::string LinearComponent::Info() const { stream << ", use-natural-gradient=" << (use_natural_gradient_ ? 
"true" : "false") << ", rank-in=" << preconditioner_in_.GetRank() - << ", rank-out=" << preconditioner_in_.GetRank() + << ", rank-out=" << preconditioner_out_.GetRank() << ", num-samples-history=" << preconditioner_in_.GetNumSamplesHistory() << ", update-period=" << preconditioner_in_.GetUpdatePeriod() From 096a42b59f8c89b4eb444c280b937fb0b8814f3e Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 15 Jan 2018 14:28:28 -0500 Subject: [PATCH 063/184] [scripts] Add options to factorized-layer --- .../libs/nnet3/xconfig/factorized_layer.py | 31 +++++++++++++++++-- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py index 16ba460a04e..e19cf014ab0 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py @@ -57,10 +57,14 @@ def set_default_configs(self): 'bottleneck-dim': -1, 'self-repair-scale': 1.0e-05, 'target-rms': 1.0, + 'extra-relu': False, 'splicing': '0', 'bypass-scale': 1.0, 'ng-affine-options': '', 'ng-linear-options': '', + # if second-matrix-orthonormal, the 2nd matrix + # has the orthonormal constraint. + 'second-matrix-orthonormal': False, # The following are passed through to components. 'bias-stddev': '', 'l2-regularize': '', @@ -132,6 +136,8 @@ def _generate_config(self): spliced_input_desc = 'Append({0})'.format( ', '.join([ 'Offset({0}, {1})'.format(input_desc, offset) for offset in splicing_array ])) + extra_relu = self.config['extra-relu'] + # e.g. spliced_input_desc = # 'Append(Offset(tdnn2, -1), Offset(tdnn2, 0), Offset(tdnn2, 1))' @@ -150,12 +156,20 @@ def _generate_config(self): if value != '': linear_options += ' {0}={1}'.format(opt_name, value) + if self.config['second-matrix-orthonormal']: + # we have to mess with the range of the parameters so they are within + # the circle of convergence... 
+ affine_options += ' orthonormal-constraint=1.0 param-stddev={0}'.format( + math.sqrt(1.0 / output_dim)) + else: + linear_options += ' orthonormal-constraint=1.0' + configs = [] # First the linear component that goes to the bottleneck dim. # note: by default the LinearComponent uses natural gradient. line = ('component name={0}.linear type=LinearComponent ' - 'orthonormal-constraint=1.0 input-dim={1} output-dim={2} {3}' + 'input-dim={1} output-dim={2} {3}' ''.format(self.name, spliced_input_dim, bottleneck_dim, linear_options)) configs.append(line) @@ -163,13 +177,24 @@ def _generate_config(self): ''.format(self.name, spliced_input_desc)) configs.append(line) + if extra_relu: + # add a relu between the linear and the affine. + line = ('component name={0}.relu0 type=RectifiedLinearComponent dim={1}' + ' self-repair-scale={2}' + ''.format(self.name, bottleneck_dim, self_repair_scale)) + configs.append(line) + line = ('component-node name={0}.relu0 component={0}.relu0 ' + 'input={0}.linear'.format(self.name)) + configs.append(line) + + # Now the affine component line = ('component name={0}.affine type=NaturalGradientAffineComponent' ' input-dim={1} output-dim={2} {3}' ''.format(self.name, bottleneck_dim, output_dim, affine_options)) configs.append(line) - line = ('component-node name={0}.affine component={0}.affine input={0}.linear' - ''.format(self.name)) + line = ('component-node name={0}.affine component={0}.affine input={0}.{1}' + ''.format(self.name, ('relu0' if extra_relu else 'linear'))) configs.append(line) # now the ReLU. 
Its input is the output of the affine component plus From 168a642f2b7dff7e2fef0f2ad348288c829738f0 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 15 Jan 2018 14:29:43 -0500 Subject: [PATCH 064/184] [src] Allow to apply orthonormal constraint on affine component --- .../steps/libs/nnet3/xconfig/basic_layers.py | 22 +++++++- src/nnet3/nnet-simple-component.cc | 33 +++++++++++- src/nnet3/nnet-simple-component.h | 18 ++++++- src/nnet3/nnet-utils.cc | 54 ++++++++++++------- src/nnet3/nnet-utils.h | 3 +- 5 files changed, 106 insertions(+), 24 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index c2962de96a7..a6de5d163c0 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -655,7 +655,15 @@ class XconfigBasicLayer(XconfigLayerBase): Parameters of the class, and their defaults: input='[-1]' [Descriptor giving the input of the layer.] - dim=None [Output dimension of layer, e.g. 1024] + dim=-1 [Output dimension of layer, e.g. 1024] + bottleneck-dim=-1 [If you set this, a linear bottleneck is added, so + we project to first bottleneck-dim then to dim. One + of the two matrices is constrained to be orthonormal; + see 'second-matrix-orthonormal'.] + second-matrix-orthonormal=False [Only makes a difference if bottleneck-dim>0. + You can set this to true if you want the orthormal-rows + constraint to be applied to the 2nd, not the first, of + the two marices.] self-repair-scale=1.0e-05 [Affects relu, sigmoid and tanh layers.] learning-rate-factor=1.0 [This can be used to make the affine component train faster or slower]. 
@@ -676,6 +684,7 @@ def set_default_configs(self): self.config = {'input': '[-1]', 'dim': -1, 'bottleneck-dim': -1, + 'second-matrix-orthonormal': False, 'self-repair-scale': 1.0e-05, 'target-rms': 1.0, 'ng-affine-options': '', @@ -790,10 +799,12 @@ def _add_components(self, input_desc, input_dim, nonlinearities): value = self.config[opt_name] if value != '': linear_options += ' {0}={1}'.format(opt_name, value) + if not self.config['second-matrix-orthonormal']: + linear_options += ' orthonormal-constraint=1.0' bottleneck_dim = self.config['bottleneck-dim'] # note: by default the LinearComponent uses natural gradient. line = ('component name={0}.linear type=LinearComponent ' - 'orthonormal-constraint=1.0 input-dim={1} output-dim={2} {3}' + 'input-dim={1} output-dim={2} {3}' ''.format(self.name, input_dim, bottleneck_dim, linear_options)) configs.append(line) line = ('component-node name={0}.linear component={0}.linear input={1}' @@ -803,6 +814,13 @@ def _add_components(self, input_desc, input_dim, nonlinearities): cur_dim = bottleneck_dim + if self.config['second-matrix-orthonormal']: + assert self.config['bottleneck-dim'] > 0 + # we have to mess with the range of the parameters so they are within + # the circle of convergence... 
+ affine_options += ' orthonormal-constraint=1.0 param-stddev={0}'.format( + math.sqrt(1.0 / output_dim)) + line = ('component name={0}.affine type=NaturalGradientAffineComponent' ' input-dim={1} output-dim={2} {3}' ''.format(self.name, cur_dim, output_dim, affine_options)) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index b4138cc989e..01adb222372 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -1031,13 +1031,15 @@ void AffineComponent::Add(BaseFloat alpha, const Component &other_in) { AffineComponent::AffineComponent(const AffineComponent &component): UpdatableComponent(component), linear_params_(component.linear_params_), - bias_params_(component.bias_params_) { } + bias_params_(component.bias_params_), + orthonormal_constraint_(component.orthonormal_constraint_) { } AffineComponent::AffineComponent(const CuMatrixBase &linear_params, const CuVectorBase &bias_params, BaseFloat learning_rate): linear_params_(linear_params), - bias_params_(bias_params) { + bias_params_(bias_params), + orthonormal_constraint_(0.0) { SetUnderlyingLearningRate(learning_rate); KALDI_ASSERT(linear_params.NumRows() == bias_params.Dim()&& bias_params.Dim() != 0); @@ -1063,6 +1065,8 @@ void AffineComponent::PerturbParams(BaseFloat stddev) { std::string AffineComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info(); + if (orthonormal_constraint_ != 0.0) + stream << ", orthonormal-constraint=" << orthonormal_constraint_; PrintParameterStats(stream, "linear-params", linear_params_, false, // include_mean true, // include_row_norms @@ -1129,6 +1133,8 @@ void AffineComponent::InitFromConfig(ConfigLine *cfl) { Init(input_dim, output_dim, param_stddev, bias_stddev); } + cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); + if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); @@ -1197,6 +1203,12 @@ void 
AffineComponent::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); ReadBasicType(is, binary, &is_gradient_); } + if (PeekToken(is, binary) == 'O') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &orthonormal_constraint_); + } else { + orthonormal_constraint_ = 0.0; + } ExpectToken(is, binary, ""); } @@ -1206,6 +1218,10 @@ void AffineComponent::Write(std::ostream &os, bool binary) const { linear_params_.Write(os, binary); WriteToken(os, binary, ""); bias_params_.Write(os, binary); + if (orthonormal_constraint_ != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, orthonormal_constraint_); + } WriteToken(os, binary, ""); } @@ -2664,6 +2680,12 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &rank_in); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &rank_out); + if (PeekToken(is, binary) == 'O') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &orthonormal_constraint_); + } else { + orthonormal_constraint_ = 0.0; + } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &update_period); ExpectToken(is, binary, ""); @@ -2770,6 +2792,9 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { bias_params_.Add(bias_mean); } + orthonormal_constraint_ = 0.0; + cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); + // Set natural-gradient configs. 
BaseFloat num_samples_history = 2000.0, alpha = 4.0; @@ -2807,6 +2832,10 @@ void NaturalGradientAffineComponent::Write(std::ostream &os, WriteBasicType(os, binary, preconditioner_in_.GetRank()); WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_out_.GetRank()); + if (orthonormal_constraint_ != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, orthonormal_constraint_); + } WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_in_.GetUpdatePeriod()); WriteToken(os, binary, ""); diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index f596ec6be75..b1eb30a55bf 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -390,10 +390,11 @@ class AffineComponent: public UpdatableComponent { virtual int32 InputDim() const { return linear_params_.NumCols(); } virtual int32 OutputDim() const { return linear_params_.NumRows(); } + BaseFloat OrthonormalConstraint() const { return orthonormal_constraint_; } virtual std::string Info() const; virtual void InitFromConfig(ConfigLine *cfl); - AffineComponent() { } // use Init to really initialize. + AffineComponent(): orthonormal_constraint_(0.0) { } // use Init to really initialize. virtual std::string Type() const { return "AffineComponent"; } virtual int32 Properties() const { return kSimpleComponent|kUpdatableComponent| @@ -434,6 +435,7 @@ class AffineComponent: public UpdatableComponent { const CuMatrixBase &linear); const CuVector &BiasParams() const { return bias_params_; } const CuMatrix &LinearParams() const { return linear_params_; } + CuMatrix &LinearParams() { return linear_params_; } explicit AffineComponent(const AffineComponent &other); // The next constructor is used in converting from nnet1. AffineComponent(const CuMatrixBase &linear_params, @@ -466,6 +468,7 @@ class AffineComponent: public UpdatableComponent { const AffineComponent &operator = (const AffineComponent &other); // Disallow. 
CuMatrix linear_params_; CuVector bias_params_; + BaseFloat orthonormal_constraint_; }; class RepeatedAffineComponent; @@ -755,6 +758,19 @@ class LogSoftmaxComponent: public NonlinearComponent { Dimension is output-dim by (input-dim + 1), last column is interpreted as the bias. + Other options: + orthonormal-constraint=0.0 If you set this to 1.0, then + the linear_params_ matrix will be (approximately) + constrained during training to have orthonormal rows + (or columns, whichever is fewer). You can choose a + positive nonzero value different than 1.0 to have a + scaled orthonormal matrix, i.e. with singular values + at the selected value (e.g. 0.5, or 2.0). This is + not enforced inside the component itself; you have to + call ConstrainOrthonormal() from the training code to + do this. All this component does is return the + OrthonormalConstraint() value. + Options to the natural gradient (you won't normally have to set these, the defaults are suitable): diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index dff4cdbee74..cc5762474d6 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -907,30 +907,48 @@ void ConstrainOrthonormalInternal(BaseFloat scale, CuMatrixBase *M) { /** This function, to be called after processing every minibatch, is responsible for enforcing the orthogonality constraint for any components of type - LinearComponent that have the "orthonormal_constraint" value set. + LinearComponent or inheriting from AffineComponent that have the + "orthonormal_constraint" value set. */ void ConstrainOrthonormal(Nnet *nnet) { + for (int32 c = 0; c < nnet->NumComponents(); c++) { Component *component = nnet->GetComponent(c); LinearComponent *lc = dynamic_cast(component); - if (lc == NULL || lc->OrthonormalConstraint() == 0.0) - continue; - if (RandInt(0, 3) != 0) - continue; // For efficiency, only do this every 4 minibatches-- it won't - // stray far. 
- - - BaseFloat scale = lc->OrthonormalConstraint(); - KALDI_ASSERT(scale > 0.0); + if (lc != NULL && lc->OrthonormalConstraint() != 0.0) { + if (RandInt(0, 3) != 0) + continue; // For efficiency, only do this every 4 minibatches-- it won't + // stray far. + BaseFloat scale = lc->OrthonormalConstraint(); + KALDI_ASSERT(scale > 0.0); + + CuMatrixBase ¶ms = lc->Params(); + int32 rows = params.NumRows(), cols = params.NumCols(); + if (rows <= cols) { + ConstrainOrthonormalInternal(scale, ¶ms); + } else { + CuMatrix params_trans(params, kTrans); + ConstrainOrthonormalInternal(scale, ¶ms_trans); + params.CopyFromMat(params_trans, kTrans); + } + } - CuMatrixBase ¶ms = lc->Params(); - int32 rows = params.NumRows(), cols = params.NumCols(); - if (rows <= cols) { - ConstrainOrthonormalInternal(scale, ¶ms); - } else { - CuMatrix params_trans(params, kTrans); - ConstrainOrthonormalInternal(scale, ¶ms_trans); - params.CopyFromMat(params_trans, kTrans); + AffineComponent *ac = dynamic_cast(component); + if (ac != NULL && ac->OrthonormalConstraint() != 0.0) { + if (RandInt(0, 3) != 0) + continue; // For efficiency, only do this every 4 minibatches-- it won't + // stray far. 
+ BaseFloat scale = ac->OrthonormalConstraint(); + KALDI_ASSERT(scale > 0.0); + CuMatrixBase ¶ms = ac->LinearParams(); + int32 rows = params.NumRows(), cols = params.NumCols(); + if (rows <= cols) { + ConstrainOrthonormalInternal(scale, ¶ms); + } else { + CuMatrix params_trans(params, kTrans); + ConstrainOrthonormalInternal(scale, ¶ms_trans); + params.CopyFromMat(params_trans, kTrans); + } } } } diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index fc1631a8d77..efa36e1f64c 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -453,7 +453,8 @@ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale, /** This function, to be called after processing every minibatch, is responsible for enforcing the orthogonality constraint for any components of type - LinearComponent that have the "orthonormal-constraint" value set to nonzero. + LinearComponent or inheriting from AffineComponent that have the + "orthonormal-constraint" value set to nonzero. In order to make it efficient on GPU, it doesn't make it completely orthonormal, it just makes it closer to being orthonormal (times the 'orthonormal_constraint' From e3ea3c8e0928113a9486d2d97ced0bb1ed053631 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 15 Jan 2018 14:33:11 -0500 Subject: [PATCH 065/184] [scripts] update chain_dir_info.pl to handle no chain l2-regularize --- egs/wsj/s5/steps/info/chain_dir_info.pl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/info/chain_dir_info.pl b/egs/wsj/s5/steps/info/chain_dir_info.pl index d0fac5292c6..cda271f9724 100755 --- a/egs/wsj/s5/steps/info/chain_dir_info.pl +++ b/egs/wsj/s5/steps/info/chain_dir_info.pl @@ -139,7 +139,7 @@ sub get_combine_info { return sprintf(" combine=%.3f->%.3f", $1, $2); } elsif (m/Combining (\S+) nnets, objective function changed from (\S+) to (\S+)/) { close(F); - return sprintf(" combine=%.3f->%.3f (over %d)", $2, $3, $1); + return sprintf(" combine=%.3f->%.3f (over %d)", $2, $3, 
$1); } } } @@ -204,6 +204,9 @@ sub get_logprob_and_accuracy_info { if (m/Overall log-probability for 'output' is (\S+) \+ (\S+)/) { $iter_to_train_logprob{$iter} = $1; $iter_to_train_penalty{$iter} = $2; + } elsif (m/Overall log-probability for 'output' is (\S+)/) { + $iter_to_train_logprob{$iter} = $1; + $iter_to_train_penalty{$iter} = 0.0; } elsif (m/Overall log-probability for 'output-xent' is (\S+) per frame/) { $iter_to_train_xent{$iter} = $1; } @@ -213,6 +216,9 @@ sub get_logprob_and_accuracy_info { if (m/Overall log-probability for 'output' is (\S+) \+ (\S+)/) { $iter_to_valid_logprob{$iter} = $1; $iter_to_valid_penalty{$iter} = $2; + } elsif (m/Overall log-probability for 'output' is (\S+)/) { + $iter_to_valid_logprob{$iter} = $1; + $iter_to_valid_penalty{$iter} = 0.0; } elsif (m/Overall log-probability for 'output-xent' is (\S+) per frame/) { $iter_to_valid_xent{$iter} = $1; } From d2a1485a4a621cc23e53a1587b495090d4b47abc Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 15 Jan 2018 14:36:05 -0500 Subject: [PATCH 066/184] [scripts] Remove factorized-layer --- .../libs/nnet3/xconfig/factorized_layer.py | 223 ------------------ egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py | 1 - egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 1 - 3 files changed, 225 deletions(-) delete mode 100644 egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py deleted file mode 100644 index e19cf014ab0..00000000000 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright 2017-2018 Johns Hopkins University (Dan Povey) -# 2016 Vijayaditya Peddinti -# 2017 Google Inc. (vpeddinti@google.com) -# 2017 Vimal Manohar -# Apache 2.0. - -""" This module contains layers that just map to a single component. 
-""" - -from __future__ import print_function -import math -import re -import sys -from libs.nnet3.xconfig.basic_layers import XconfigLayerBase - - -class XconfigFactorizedLayer(XconfigLayerBase): - """This class is for parsing lines like - 'factorized-layer name=tdnn1 dim=1024 bottleneck-dim=256 bypass-scale=1.0 splicing=-3,0,3' - - This is basically the same as a relu-batchnorm-layer with the bottleneck-dim - set, except that it supports the 'bypass-scale' option, which makes the - whole thing a bit like a res-block. You specify the splicing via the 'splicing' - option instead of via 'input=xxx', as it needs to use the non-spliced inupt for - the bypass. - - Note: the 'dim' is actually optional; it will default to the - dimension of the input, and it must be the same as the dimension of the input. - - - Parameters of the class, and their defaults: - input='[-1]' [Descriptor giving the input of the layer.] - splicing='0' [In general can be a comma-separated string describing - the TDNN time-offsets, like '-1,0,1' or '-3,0,3'. - Not specified via 'input', because we need the un-spliced - input so that we can do the] bypass. - dim=-1 [Output dimension of layer, e.g. 1024; must be set.] - bottleneck-dim=-1 [Bottleneck dimension, must be set; e.g. 256] - self-repair-scale=1.0e-05 [Affects the relu layer] - learning-rate-factor=1.0 [This can be used to make the affine component - train faster or slower]. - l2-regularize=0.0 [Set this to a nonzero value (e.g. 1.0e-05) to - add l2 regularization on the parameter norm for - this component. - - """ - def __init__(self, first_token, key_to_value, prev_names=None): - assert first_token == "factorized-layer" - XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) - - def set_default_configs(self): - - # note: self.config['input'] is a descriptor, '[-1]' means output - # the most recent layer. 
- self.config = {'input': '[-1]', - 'dim': -1, - 'bottleneck-dim': -1, - 'self-repair-scale': 1.0e-05, - 'target-rms': 1.0, - 'extra-relu': False, - 'splicing': '0', - 'bypass-scale': 1.0, - 'ng-affine-options': '', - 'ng-linear-options': '', - # if second-matrix-orthonormal, the 2nd matrix - # has the orthonormal constraint. - 'second-matrix-orthonormal': False, - # The following are passed through to components. - 'bias-stddev': '', - 'l2-regularize': '', - 'learning-rate-factor': '', - 'max-change': 0.75 } - - def check_configs(self): - input_dim = self.descriptors['input']['dim'] - - if self.config['dim'] == -1: - self.config['dim'] = input_dim - elif self.config['dim'] != input_dim: - raise RuntimeError("Dimension mismatch: dim={0} vs. input-dim={1}".format( - self.config['dim'], input_dim)) - b = self.config['bottleneck-dim'] - if b <= 0 or b >= self.config['dim']: - raise RuntimeError("bottleneck-dim has an invalid value {0}".format(b)) - - if self.config['self-repair-scale'] < 0.0 or self.config['self-repair-scale'] > 1.0: - raise RuntimeError("self-repair-scale has invalid value {0}" - .format(self.config['self-repair-scale'])) - if self.config['target-rms'] < 0.0: - raise RuntimeError("target-rms has invalid value {0}" - .format(self.config['target-rms'])) - if self.config['learning-rate-factor'] <= 0.0: - raise RuntimeError("learning-rate-factor has invalid value {0}" - .format(self.config['learning-rate-factor'])) - - splicing = self.config['splicing'] - try: - splicing_array = [ int(x) for x in splicing.split(',') ] - if not 0 in splicing_array: - raise RuntimeError("0 should probably be in the splicing indexes.") - except: - raise RuntimeError("Invalid option splicing={0}".format(splicing)) - - def output_name(self, auxiliary_output=None): - assert auxiliary_output is None - # return something like: tdnn3.batchnorm - return '{0}.batchnorm'.format(self.name) - - def output_dim(self, auxiliary_output=None): - output_dim = self.config['dim'] - # If not 
set, the output-dim defaults to the input-dim. - if output_dim <= 0: - self.config['dim'] = self.descriptors['input']['dim'] - return output_dim - - def get_full_config(self): - ans = [] - config_lines = self._generate_config() - - for line in config_lines: - for config_name in ['ref', 'final']: - # we do not support user specified matrices in this layer - # so 'ref' and 'final' configs are the same. - ans.append((config_name, line)) - return ans - - def _generate_config(self): - input_desc = self.descriptors['input']['final-string'] - input_dim = self.descriptors['input']['dim'] - bottleneck_dim = self.config['bottleneck-dim'] - output_dim = input_dim - self_repair_scale = self.config['self-repair-scale'] - target_rms = self.config['target-rms'] - bypass_scale = self.config['bypass-scale'] - splicing_array = [ int(x) for x in self.config['splicing'].split(',') ] - spliced_input_desc = 'Append({0})'.format( - ', '.join([ 'Offset({0}, {1})'.format(input_desc, offset) - for offset in splicing_array ])) - extra_relu = self.config['extra-relu'] - - # e.g. spliced_input_desc = - # 'Append(Offset(tdnn2, -1), Offset(tdnn2, 0), Offset(tdnn2, 1))' - - spliced_input_dim = input_dim * len(splicing_array) - - affine_options = self.config['ng-affine-options'] - for opt_name in [ 'max-change', 'learning-rate-factor', - 'bias-stddev', 'l2-regularize' ]: - value = self.config[opt_name] - if value != '': - affine_options += ' {0}={1}'.format(opt_name, value) - - linear_options = self.config['ng-linear-options'] - for opt_name in [ 'max-change', 'learning-rate-factor' ]: - value = self.config[opt_name] - if value != '': - linear_options += ' {0}={1}'.format(opt_name, value) - - if self.config['second-matrix-orthonormal']: - # we have to mess with the range of the parameters so they are within - # the circle of convergence... 
- affine_options += ' orthonormal-constraint=1.0 param-stddev={0}'.format( - math.sqrt(1.0 / output_dim)) - else: - linear_options += ' orthonormal-constraint=1.0' - - configs = [] - - # First the linear component that goes to the bottleneck dim. - # note: by default the LinearComponent uses natural gradient. - line = ('component name={0}.linear type=LinearComponent ' - 'input-dim={1} output-dim={2} {3}' - ''.format(self.name, spliced_input_dim, bottleneck_dim, - linear_options)) - configs.append(line) - line = ('component-node name={0}.linear component={0}.linear input={1}' - ''.format(self.name, spliced_input_desc)) - configs.append(line) - - if extra_relu: - # add a relu between the linear and the affine. - line = ('component name={0}.relu0 type=RectifiedLinearComponent dim={1}' - ' self-repair-scale={2}' - ''.format(self.name, bottleneck_dim, self_repair_scale)) - configs.append(line) - line = ('component-node name={0}.relu0 component={0}.relu0 ' - 'input={0}.linear'.format(self.name)) - configs.append(line) - - - # Now the affine component - line = ('component name={0}.affine type=NaturalGradientAffineComponent' - ' input-dim={1} output-dim={2} {3}' - ''.format(self.name, bottleneck_dim, output_dim, affine_options)) - configs.append(line) - line = ('component-node name={0}.affine component={0}.affine input={0}.{1}' - ''.format(self.name, ('relu0' if extra_relu else 'linear'))) - configs.append(line) - - # now the ReLU. Its input is the output of the affine component plus - # the non-sliced input (this is a bit like a res-block). 
- line = ('component name={0}.relu type=RectifiedLinearComponent dim={1}' - ' self-repair-scale={2}' - ''.format(self.name, output_dim, self_repair_scale)) - configs.append(line) - if bypass_scale != 0.0: - line = ('component-node name={0}.relu component={0}.relu ' - 'input=Sum(Scale({1}, {2}), {0}.affine) ' - ''.format(self.name, bypass_scale, input_desc)) - else: - line = ('component-node name={0}.relu component={0}.relu ' - 'input={0}.affine'.format(self.name)) - configs.append(line) - - line = ('component name={0}.batchnorm type=BatchNormComponent ' - 'dim={1} target-rms={2}' - ''.format(self.name, output_dim, target_rms)) - configs.append(line) - line = ('component-node name={0}.batchnorm component={0}.batchnorm ' - 'input={0}.relu'.format(self.name)) - configs.append(line) - - return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py index d609fb2e685..593fbbb745c 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py @@ -10,4 +10,3 @@ from gru import * from stats_layer import * from trivial_layers import * -from factorized_layer import * diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index ec9137eadd6..c6b0619bca8 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -68,7 +68,6 @@ 'opgru-layer' : xlayers.XconfigOpgruLayer, 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, - 'factorized-layer': xlayers.XconfigFactorizedLayer, 'renorm-component': xlayers.XconfigRenormComponent, 'no-op-component': xlayers.XconfigNoOpComponent } From 99137449b70cc6cbc3f260b213750b977d95ee84 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 16 Jan 2018 13:29:12 -0500 Subject: [PATCH 067/184] [egs] update swbd compare_wer_general.sh to include rt03 --- 
.../s5c/local/chain/compare_wer_general.sh | 56 +++++++++++++++++-- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/egs/swbd/s5c/local/chain/compare_wer_general.sh b/egs/swbd/s5c/local/chain/compare_wer_general.sh index c4c3d182bfe..fcd66d5d78d 100755 --- a/egs/swbd/s5c/local/chain/compare_wer_general.sh +++ b/egs/swbd/s5c/local/chain/compare_wer_general.sh @@ -8,10 +8,18 @@ echo "# $0 $*"; # print command line. include_looped=false -if [ "$1" == "--looped" ]; then - include_looped=true - shift -fi +include_rt03=false + +for x in $(seq 3); do + if [ "$1" == "--looped" ]; then + include_looped=true + shift + fi + if [ "$1" == "--rt03" ]; then + include_rt03=true + shift + fi +done echo -n "# System " for x in $*; do printf " % 9s" $x; done @@ -120,6 +128,46 @@ if $include_looped; then fi +if $include_rt03; then + echo -n "# WER on rt03(tg) " + for x in $*; do + set_names $x + wer=$(grep Sum $dirname/decode_rt03*sw1_tg$epoch_suffix/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x + wer=$(grep Sum $dirname/decode_rt03*sw1_tg${epoch_suffix}_looped/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + + echo -n "# WER on rt03(fg) " + for x in $*; do + set_names $x + wer=$(grep Sum $dirname/decode_rt03*sw1_fsh_fg$epoch_suffix/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x + wer=$(grep Sum $dirname/decode_rt03*sw1_fsh_fg${epoch_suffix}_looped/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +fi + + + if $used_epochs; then # we don't print the probs in this case. 
exit 0 From 69359940ff26654b2c6fc7f94a22ed2b7f4af1a3 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 16 Jan 2018 19:04:17 -0500 Subject: [PATCH 068/184] [src,egs] Somer nnet3 fixes that shouldn't affect anything; adding some example scripts. --- .../s5c/local/chain/tuning/run_tdnn_7m.sh | 5 +- .../s5c/local/chain/tuning/run_tdnn_7m19c.sh | 383 ++++++++++ .../local/chain/tuning/run_tdnn_lstm_1m.sh | 7 +- egs/wsj/s5/local/chain/run_tdnn.sh | 2 +- egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh | 32 +- .../local/chain/tuning/run_tdnn_lstm_1b22c.sh | 680 ++++++++++++++++++ .../internal/resolve_ctm_edits_overlaps.py | 2 +- src/nnet3/nnet-analyze.cc | 20 +- src/nnet3/nnet-analyze.h | 9 +- src/nnet3/nnet-optimize-utils.cc | 13 +- src/nnet3/nnet-optimize.cc | 4 +- 11 files changed, 1125 insertions(+), 32 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19c.sh create mode 100755 egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b22c.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh index 552e944c05a..03b1ee3c97f 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh @@ -31,6 +31,7 @@ speed_perturb=true dir=exp/chain/tdnn_7m # Note: _sp will get added to this if $speed_perturb == true. decode_iter= decode_nj=50 +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi # training options num_epochs=4 @@ -214,7 +215,7 @@ if [ ! 
-z $decode_iter ]; then fi if [ $stage -le 15 ]; then rm $dir/.error 2>/dev/null || true - for decode_set in train_dev eval2000; do + for decode_set in train_dev eval2000 $maybe_rt03; do ( steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ @@ -243,7 +244,7 @@ if $test_online_decoding && [ $stage -le 16 ]; then $lang exp/nnet3/extractor $dir ${dir}_online rm $dir/.error 2>/dev/null || true - for decode_set in train_dev eval2000; do + for decode_set in train_dev eval2000 $maybe_rt03; do ( # note: we just give it "$decode_set" as it only uses the wav.scp, the # feature type does not matter. diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19c.sh new file mode 100755 index 00000000000..8cc029744a1 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19c.sh @@ -0,0 +1,383 @@ +#!/bin/bash +# Note: before merging to master, this will be renamed. + +# 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). +# Seems about 0.1% better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# WER on train_dev(tg) 13.09 12.93 12.86 +# WER on train_dev(fg) 12.12 11.87 11.82 +# WER on eval2000(tg) 15.8 15.6 15.4 +# WER on eval2000(fg) 14.3 14.0 13.8 +# WER on rt03(tg) 14.8 14.9 14.8 +# WER on rt03(fg) 12.4 12.5 12.5 +# Final train prob -0.096 -0.096 -0.094 +# Final valid prob -0.106 -0.106 -0.103 +# Final train prob (xent) -1.198 -1.188 -1.117 +# Final valid prob (xent) -1.2070 -1.1980 -1.1223 +# Num-parameters 15528996 16512036 17824036 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 14.8 14.9 +# WER on rt03(fg) 12.4 12.5 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. 
+# Seems a little better but could be due to the increase in parameters. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp +# System tdnn7m8_sp tdnn7m9_sp +# WER on train_dev(tg) 13.60 13.88 +# WER on train_dev(fg) 12.62 12.64 +# WER on eval2000(tg) 16.8 16.1 +# WER on eval2000(fg) 15.4 14.4 +# WER on rt03(tg) 16.2 15.5 +# WER on rt03(fg) 13.7 13.1 +# Final train prob -0.105 -0.111 +# Final valid prob -0.115 -0.119 +# Final train prob (xent) -1.282 -1.309 +# Final valid prob (xent) -1.3194 -1.3246 +# Num-parameters 11580452 13818148 + +# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which +# is the same as 7m2->7m3, which was helpful there. +# Does seem helpful. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# WER on train_dev(tg) 13.70 13.74 13.81 13.60 +# WER on train_dev(fg) 12.67 12.76 12.74 12.62 +# WER on eval2000(tg) 16.6 17.1 17.0 16.8 +# WER on eval2000(fg) 15.1 15.4 15.4 15.4 +# WER on rt03(tg) 16.1 16.2 16.0 16.2 +# WER on rt03(fg) 13.7 13.8 13.6 13.7 +# Final train prob -0.085 -0.106 -0.104 -0.105 +# Final valid prob -0.103 -0.118 -0.116 -0.115 +# Final train prob (xent) -1.230 -1.296 -1.285 -1.282 +# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194 +# Num-parameters 16292693 10924836 11580452 11580452 + + +# 7m5b is as 7m5 but rducing the prefinal layer dims to previous values. +# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average, +# but not clear at all... for consistency with other setups I may retain +# this change. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# WER on train_dev(tg) 13.70 13.74 13.71 13.81 +# WER on train_dev(fg) 12.67 12.76 12.64 12.74 +# WER on eval2000(tg) 16.6 17.1 16.8 17.0 +# WER on eval2000(fg) 15.1 15.4 15.1 15.4 +# WER on rt03(tg) 16.1 16.2 16.2 16.0 +# WER on rt03(fg) 13.7 13.8 13.8 13.6 +# Final train prob -0.085 -0.106 -0.103 -0.104 +# Final valid prob -0.103 -0.118 -0.114 -0.116 +# Final train prob (xent) -1.230 -1.296 -1.274 -1.285 +# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283 +# Num-parameters 16292693 10924836 12170788 11580452 + + +# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer +# and the prefinal layers from 512 to 768. +# 7m2 is as 7m but with a bunch of tuning changes (model is smaller). +# 7m is as 7k but adding two non-splicing layers towards the beginning of the +# network. 
+# The impovement is pretty small but I've seen similar improvements on other +# setups with this architecture so I tend to believe it. + + +# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp +# System tdnn_7k_sp tdnn_7m_sp +# WER on train_dev(tg) 13.83 13.65 +# WER on train_dev(fg) 12.74 12.54 +# WER on eval2000(tg) 16.9 16.8 +# WER on eval2000(fg) 15.2 15.1 +# Final train prob -0.085 -0.084 +# Final valid prob -0.107 -0.103 +# Final train prob (xent) -1.267 -1.215 +# Final valid prob (xent) -1.3107 -1.2735 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp +# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7m19c +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 bottleneck-dim=192 + relu-batchnorm-layer name=tdnn2 $opts input=Append(-1,0,1) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn3 $opts dim=1280 bottleneck-dim=192 + relu-batchnorm-layer name=tdnn4 $opts input=Append(-1,0,1) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn5 $opts dim=1280 bottleneck-dim=192 + relu-batchnorm-layer name=tdnn6 $opts input=Append(-3,0,3) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn7 $opts input=Append(-3,0,3,tdnn5) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn8 $opts input=Append(-3,0,3) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn9 $opts input=Append(-3,0,3,tdnn7) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn10 $opts input=Append(-3,0,3) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn11 $opts input=Append(-3,0,3,tdnn9) dim=1280 bottleneck-dim=256 + + relu-batchnorm-layer 
name=prefinal-chain input=tdnn11 $opts dim=1280 bottleneck-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=tdnn11 $opts dim=1280 bottleneck-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' 
from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh index 1d566290163..4b2c93082d9 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -6,7 +6,7 @@ # After comparing different combinations of dropout(with or without) and decay-time # option(20, 40 or without), we found this setup is best. -#System tdnn_lstm_1l_ld5 tdnn_lstm_1m_ld 1m_online +#System tdnn_lstm_1l_ld5 tdnn_lstm_1m_ld 1m_online #WER on train_dev(tg) 12.41 12.37 12.21 #WER on train_dev(fg) 11.59 11.46 11.41 #WER on eval2000(tg) 14.8 14.8 14.9 @@ -30,6 +30,7 @@ dir=exp/chain/tdnn_lstm_1m # Note: _sp will get added to this if $speed_perturb decode_iter= decode_dir_affix= decode_nj=50 +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi # training options leftmost_questions_truncate=-1 @@ -227,7 +228,7 @@ if [ $stage -le 15 ]; then if [ ! 
-z $decode_iter ]; then iter_opts=" --iter $decode_iter " fi - for decode_set in train_dev eval2000; do + for decode_set in train_dev eval2000 $maybe_rt03; do ( steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj 50 --cmd "$decode_cmd" $iter_opts \ @@ -257,7 +258,7 @@ if $test_online_decoding && [ $stage -le 16 ]; then $lang exp/nnet3/extractor $dir ${dir}_online rm $dir/.error 2>/dev/null || true - for decode_set in train_dev eval2000; do + for decode_set in train_dev eval2000 rt03; do ( # note: we just give it "$decode_set" as it only uses the wav.scp, the # feature type does not matter. diff --git a/egs/wsj/s5/local/chain/run_tdnn.sh b/egs/wsj/s5/local/chain/run_tdnn.sh index 75da1a0a553..cb5756188a4 120000 --- a/egs/wsj/s5/local/chain/run_tdnn.sh +++ b/egs/wsj/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1e.sh \ No newline at end of file +tuning/run_tdnn_1f.sh \ No newline at end of file diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh index 2660adb85d7..be8d39de80b 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh @@ -1,24 +1,26 @@ #!/bin/bash # 1f is as 1e but a re-tuned model with fewer parameters and a bottleneck at the -# end. +# end, and no chain l2-regularize +#[note: was 1e12e.] 
-# local/chain/compare_wer.sh exp/chain/tdnn1e10_sp exp/chain/tdnn1f_sp -# System tdnn1e10_sp tdnn1f_sp +# local/chain/compare_wer.sh exp/chain/tdnn1e10_sp exp/chain/tdnn1e12e_sp +# System tdnn1e10_sp tdnn1e12e_sp #WER dev93 (tgpr) 7.29 7.20 -#WER dev93 (tg) 7.08 7.00 -#WER dev93 (big-dict,tgpr) 5.15 5.08 -#WER dev93 (big-dict,fg) 4.52 4.65 -#WER eval92 (tgpr) 5.12 4.93 -#WER eval92 (tg) 4.91 4.66 -#WER eval92 (big-dict,tgpr) 2.94 2.87 -#WER eval92 (big-dict,fg) 2.57 2.39 -# Final train prob -0.0545 -0.0512 -# Final valid prob -0.0650 -0.0641 -# Final train prob (xent) -0.9696 -0.9105 -# Final valid prob (xent) -0.9917 -0.9523 +#WER dev93 (tg) 7.08 6.81 +#WER dev93 (big-dict,tgpr) 5.15 5.04 +#WER dev93 (big-dict,fg) 4.52 4.42 +#WER eval92 (tgpr) 5.12 4.80 +#WER eval92 (tg) 4.91 4.54 +#WER eval92 (big-dict,tgpr) 2.94 2.76 +#WER eval92 (big-dict,fg) 2.57 2.30 +# Final train prob -0.0545 -0.0455 +# Final valid prob -0.0650 -0.0599 +# Final train prob (xent) -0.9696 -0.9060 +# Final valid prob (xent) -0.9917 -0.9448 # Num-params 8067660 6071244 + # exp/chain/tdnn1e_sp: num-iters=72 nj=2..8 num-params=8.1M dim=40+100->2854 combine=-0.064->-0.063 (over 3) xent:train/valid[47,71,final]=(-1.07,-0.973,-0.970/-1.08,-0.992,-0.992) logprob:train/valid[47,71,final]=(-0.064,-0.056,-0.054/-0.072,-0.066,-0.065) # exp/chain/tdnn1f_sp: num-iters=72 nj=2..8 num-params=6.1M dim=40+100->2854 combine=-0.061->-0.061 (over 2) xent:train/valid[47,71,final]=(-1.04,-0.911,-0.910/-1.06,-0.953,-0.952) logprob:train/valid[47,71,final]=(-0.063,-0.052,-0.051/-0.071,-0.064,-0.064) @@ -216,7 +218,7 @@ if [ $stage -le 16 ]; then --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient=0.1 \ - --chain.l2-regularize=0.00005 \ + --chain.l2-regularize=0.0 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=2000" \ --trainer.srand=$srand \ diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b22c.sh 
b/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b22c.sh new file mode 100755 index 00000000000..0e5ba084f71 --- /dev/null +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b22c.sh @@ -0,0 +1,680 @@ +#!/bin/bash +#TODO: this needs to be renamed and the comments changed, before merging to master. + +# 1b22c is as 1b22 but setting label delay to 8. It improves on average, even +# if not everywhere. + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1a_sp exp/chain/tdnn_lstm1b21_sp exp/chain/tdnn_lstm1b22_sp exp/chain/tdnn_lstm1b22c_sp +# System tdnn_lstm1a_sp tdnn_lstm1b21_sp tdnn_lstm1b22_sp tdnn_lstm1b22c_sp +#WER dev93 (tgpr) 7.64 7.69 7.47 7.24 +#WER dev93 (tg) 7.29 7.27 7.14 7.03 +#WER dev93 (big-dict,tgpr) 5.53 5.42 5.31 5.04 +#WER dev93 (big-dict,fg) 5.14 5.04 5.00 4.92 +#WER eval92 (tgpr) 5.62 5.19 5.14 5.23 +#WER eval92 (tg) 5.30 5.00 4.93 4.78 +#WER eval92 (big-dict,tgpr) 3.62 3.24 3.12 3.17 +#WER eval92 (big-dict,fg) 3.31 2.96 2.73 2.73 +# Final train prob -0.0344 -0.0470 -0.0401 -0.0403 +# Final valid prob -0.0518 -0.0587 -0.0527 -0.0526 +# Final train prob (xent) -0.5589 -0.7782 -0.7484 -0.7406 +# Final valid prob (xent) -0.6620 -0.8210 -0.7865 -0.7766 +# Num-params 9106252 4216524 4216524 4216524 + +# 1b22 is as 1b21 but setting chain.l2-regularize to zero. + +# 1b21 is as 1b20 but half the learning rate.. + +# 1b20 is as 1b19b but reducing dimensions of TDNN layers from 512 to 448. +# 1b19b is as 1b19 but with more epochs (4->6) +# 1b19 is a rerun of 1b18d3 (a fairly small LSTM+TDNN setup). +# +# +# 1b18d3 is as 1b18d2 but reducing lstm bottleneck dim from 304 to 256. +# [1b18d2 is just a rerun of 1b18d as I merged various code changes and +# I want to make sure nothing bad happened.] +# +# Results below show it's probably slightly better than the average of 18d and 18d2 +# (which are supposed to be the same experiment)... 
+# +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b18d_sp exp/chain/tdnn_lstm1b18d2_sp exp/chain/tdnn_lstm1b18d3_sp +# System tdnn_lstm1b18d_sp tdnn_lstm1b18d2_sp tdnn_lstm1b18d3_sp +#WER dev93 (tgpr) 7.78 7.46 7.46 +#WER dev93 (tg) 7.29 7.30 7.04 +#WER dev93 (big-dict,tgpr) 5.56 5.51 5.55 +#WER dev93 (big-dict,fg) 5.32 5.08 5.05 +#WER eval92 (tgpr) 5.33 5.40 5.39 +#WER eval92 (tg) 5.05 5.03 4.96 +#WER eval92 (big-dict,tgpr) 3.42 3.26 3.35 +#WER eval92 (big-dict,fg) 2.91 2.64 2.82 +# Final train prob -0.0529 -0.0536 -0.0543 +# Final valid prob -0.0633 -0.0630 -0.0636 +# Final train prob (xent) -0.8327 -0.8330 -0.8415 +# Final valid prob (xent) -0.8693 -0.8672 -0.8695 +# Num-params 4922060 4922060 4805324 + +# +# 1b18d is as 1b18c, but adding 'self-scale=2.0' to scale up the m_trunc when it is given +# as input to the affine projections (I found previously this was helpful). +# .. Interesting: objf improves but WER is not better. +# +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b18c_sp exp/chain/tdnn_lstm1b18d_sp +# System tdnn_lstm1b18c_sp tdnn_lstm1b18d_sp +#WER dev93 (tgpr) 7.77 7.78 +#WER dev93 (tg) 7.40 7.29 +#WER dev93 (big-dict,tgpr) 5.39 5.56 +#WER dev93 (big-dict,fg) 5.25 5.32 +#WER eval92 (tgpr) 5.48 5.33 +#WER eval92 (tg) 4.98 5.05 +#WER eval92 (big-dict,tgpr) 3.07 3.42 +#WER eval92 (big-dict,fg) 2.69 2.91 +# Final train prob -0.0546 -0.0529 +# Final valid prob -0.0641 -0.0633 +# Final train prob (xent) -0.8679 -0.8327 +# Final valid prob (xent) -0.8954 -0.8693 +# Num-params 4922060 4922060 + +# 1b18c is as 1b18b, but fixing a bug in the script whereby c instead of m had been used +# as input to the affine projections. + +# 1b18b is as 1b18, but doubling l2 regularization on the output +# and lstm layers, parts of them were training too slowly. +# +# 1b18 is as 1b17, but via script change, not using memory-norm (actually +# this is the same as 1b17d). +# I don't see any WER change, but objf is worse. 
+ +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b17_sp exp/chain/tdnn_lstm1b17d_sp exp/chain/tdnn_lstm1b18_sp +# System tdnn_lstm1b17_sp tdnn_lstm1b17d_sp tdnn_lstm1b18_sp +#WER dev93 (tgpr) 7.49 7.44 7.48 +#WER dev93 (tg) 7.18 7.13 7.19 +#WER dev93 (big-dict,tgpr) 5.50 5.34 5.48 +#WER dev93 (big-dict,fg) 5.11 5.15 5.04 +#WER eval92 (tgpr) 5.26 5.32 5.32 +#WER eval92 (tg) 5.00 4.94 5.03 +#WER eval92 (big-dict,tgpr) 3.24 3.28 3.26 +#WER eval92 (big-dict,fg) 2.82 2.80 2.84 +# Final train prob -0.0489 -0.0486 -0.0496 +# Final valid prob -0.0583 -0.0599 -0.0612 +# Final train prob (xent) -0.7550 -0.7809 -0.7749 +# Final valid prob (xent) -0.7988 -0.8121 -0.8131 +# Num-params 4922060 4922060 4922060 + +# 1b17 is as 1b13m, it's just a rerun after some code changes (adding +# diagonal natural gradient stuff) which should make no difference. +# Still seems to be working. + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13d_sp exp/chain/tdnn_lstm1b13m_sp exp/chain/tdnn_lstm1b17_sp +# System tdnn_lstm1b13d_sp tdnn_lstm1b13m_sp tdnn_lstm1b17_sp +#WER dev93 (tgpr) 7.86 7.43 7.49 +#WER dev93 (tg) 7.40 7.00 7.18 +#WER dev93 (big-dict,tgpr) 5.65 5.21 5.50 +#WER dev93 (big-dict,fg) 5.11 4.76 5.11 +#WER eval92 (tgpr) 5.64 5.39 5.26 +#WER eval92 (tg) 5.17 5.00 5.00 +#WER eval92 (big-dict,tgpr) 3.21 3.30 3.24 +#WER eval92 (big-dict,fg) 2.84 2.62 2.82 +# Final train prob -0.0469 -0.0516 -0.0489 +# Final valid prob -0.0601 -0.0607 -0.0583 +# Final train prob (xent) -0.7424 -0.7593 -0.7550 +# Final valid prob (xent) -0.7920 -0.7982 -0.7988 +# Num-params 5456076 4922060 4922060 + +# 1b13m is as 1b13l, but reverting the LSTM script "fix" (which actually +# made things worse), so the baseline is 1b13{c,d} (and the change versus +# c,d is to add bottleneck-dim=256). 
+# +# It's helpful: +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13c_sp exp/chain/tdnn_lstm1b13d_sp exp/chain/tdnn_lstm1b13m_sp +# System tdnn_lstm1b13c_sp tdnn_lstm1b13d_sp tdnn_lstm1b13m_sp +#WER dev93 (tgpr) 7.68 7.86 7.43 +#WER dev93 (tg) 7.34 7.40 7.00 +#WER dev93 (big-dict,tgpr) 5.42 5.65 5.21 +#WER dev93 (big-dict,fg) 5.05 5.11 4.76 +#WER eval92 (tgpr) 5.48 5.64 5.39 +#WER eval92 (tg) 5.26 5.17 5.00 +#WER eval92 (big-dict,tgpr) 3.23 3.21 3.30 +#WER eval92 (big-dict,fg) 2.82 2.84 2.62 +# Final train prob -0.0490 -0.0469 -0.0516 +# Final valid prob -0.0597 -0.0601 -0.0607 +# Final train prob (xent) -0.7549 -0.7424 -0.7593 +# Final valid prob (xent) -0.7910 -0.7920 -0.7982 +# Num-params 5456076 5456076 4922060 +# +# +# 1b13l is as 1b13k, but adding bottleneck-dim=256 to the output layers. +# Definitely helpful: + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13k_sp exp/chain/tdnn_lstm1b13l_sp +# System tdnn_lstm1b13k_sp tdnn_lstm1b13l_sp +#WER dev93 (tgpr) 7.94 7.46 +#WER dev93 (tg) 7.68 7.09 +#WER dev93 (big-dict,tgpr) 5.91 5.39 +#WER dev93 (big-dict,fg) 5.56 4.94 +#WER eval92 (tgpr) 5.65 5.44 +#WER eval92 (tg) 5.32 5.09 +#WER eval92 (big-dict,tgpr) 3.49 3.15 +#WER eval92 (big-dict,fg) 3.07 2.94 +# Final train prob -0.0491 -0.0513 +# Final valid prob -0.0600 -0.0599 +# Final train prob (xent) -0.7395 -0.7490 +# Final valid prob (xent) -0.7762 -0.7860 +# Num-params 5456076 4922060 + +# 1b13k is as 1b13d, but after a script fix: previously we were using the 'c' +# for the full-matrix part of the recurrence instead of the 'm'. + +# 1b13d is as 1b13c, but a rerun after fixing a code bug whereby the natural gradient +# for the LinearComponent was turned off by default when initializing from config. +# **Update: turns out there was no difference here, the code had been ignoring +# that config variable.** +# +# It seems to optimize better, although the WER change is unclear. 
However, it's +# interesting that the average objf in the individual training jobs (train.*.log) is not better- +# but in compute_prob_train.*.log it is. It seems that the natural gradient interacts +# well with model averaging, which is what we found previously in the NG paper. + + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13c_sp exp/chain/tdnn_lstm1b13d_sp +# System tdnn_lstm1b13c_sp tdnn_lstm1b13d_sp +#WER dev93 (tgpr) 7.68 7.86 +#WER dev93 (tg) 7.34 7.40 +#WER dev93 (big-dict,tgpr) 5.42 5.65 +#WER dev93 (big-dict,fg) 5.05 5.11 +#WER eval92 (tgpr) 5.48 5.64 +#WER eval92 (tg) 5.26 5.17 +#WER eval92 (big-dict,tgpr) 3.23 3.21 +#WER eval92 (big-dict,fg) 2.82 2.84 +# Final train prob -0.0490 -0.0469 +# Final valid prob -0.0597 -0.0601 +# Final train prob (xent) -0.7549 -0.7424 +# Final valid prob (xent) -0.7910 -0.7920 +# Num-params 5456076 5456076 +# +# +# 1b13c is as 1b13b, but after script change in which the lstmb layer was +# rewritten, adding memnorm and removing the scale of 4.0, along with some +# more minor changes and streamlining/removing options. +# +# 1b13b is as 1b13, but a rerun after merging with the memnorm-and-combine +# branch. Slight difference in num-params is because of 300 vs 304. + +# 1b13 is as 1b10 but reducing the bottleneck dim to 304 +# (because I want to get in the habit of using multiples of 8). +# WER seems improved. 
+# +# + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b10_sp exp/chain/tdnn_lstm1b13_sp +# System tdnn_lstm1b10_sp tdnn_lstm1b13_sp +#WER dev93 (tgpr) 7.87 7.63 +#WER dev93 (tg) 7.48 7.46 +#WER dev93 (big-dict,tgpr) 5.55 5.56 +#WER dev93 (big-dict,fg) 5.25 5.09 +#WER eval92 (tgpr) 5.44 5.48 +#WER eval92 (tg) 5.05 5.12 +#WER eval92 (big-dict,tgpr) 3.24 3.17 +#WER eval92 (big-dict,fg) 2.73 2.60 +# Final train prob -0.0463 -0.0470 +# Final valid prob -0.0561 -0.0565 +# Final train prob (xent) -0.7362 -0.7588 +# Final valid prob (xent) -0.7730 -0.7831 +# Num-params 5650636 5446348 + +# 1b10 is as 1b9 but reducing the cell and bottleneck dimension of LSTM layer from 512 to 384. +# Seems helpful on average-- nice! + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b9_sp exp/chain/tdnn_lstm1b10_sp +# System tdnn_lstm1b9_sp tdnn_lstm1b10_sp +#WER dev93 (tgpr) 7.74 7.87 +#WER dev93 (tg) 7.46 7.48 +#WER dev93 (big-dict,tgpr) 5.67 5.55 +#WER dev93 (big-dict,fg) 5.31 5.25 +#WER eval92 (tgpr) 5.60 5.44 +#WER eval92 (tg) 5.42 5.05 +#WER eval92 (big-dict,tgpr) 3.47 3.24 +#WER eval92 (big-dict,fg) 3.07 2.73 +# Final train prob -0.0413 -0.0463 +# Final valid prob -0.0543 -0.0561 +# Final train prob (xent) -0.6786 -0.7362 +# Final valid prob (xent) -0.7249 -0.7730 +# Num-params 7021644 5650636 + +# 1b9 is as 1b8 but adding batchnorm after the LSTM layer.. this is +# to correct an oversight. +# 1b8 is as 1b7 but with quite a few layers removed. WER effect is unclear. 
+ +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b7_sp exp/chain/tdnn_lstm1b8_sp +# System tdnn_lstm1b7_sp tdnn_lstm1b8_sp +#WER dev93 (tgpr) 7.31 7.60 +#WER dev93 (tg) 7.10 7.25 +#WER dev93 (big-dict,tgpr) 5.26 5.26 +#WER dev93 (big-dict,fg) 4.64 4.93 +#WER eval92 (tgpr) 5.48 5.32 +#WER eval92 (tg) 5.00 5.07 +#WER eval92 (big-dict,tgpr) 3.35 3.31 +#WER eval92 (big-dict,fg) 2.99 2.84 +# Final train prob -0.0483 -0.0533 +# Final valid prob -0.0573 -0.0627 +# Final train prob (xent) -0.7207 -0.8234 +# Final valid prob (xent) -0.7467 -0.8466 +# Num-params 11752524 7021644 + +# 1b7 is as 1b6 but adding self-stabilize=true and normalize-type=none; +# and after a script-level change that scale 'c' by 4 before giving it +# to the W_all_a matrix (to see where all this came from, look at run_tdnn_lstm_1b16.sh +# in the mini_librispeech setup, although by the time you see this, that may no longer exist). +# +# 1b6 is as 1b3 but replacing renorm with batchnorm for the TDNN layers, +# and adding batchnorm to the LSTMB layers. Effect on WER unclear but generally +# it's better. 
+ + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1{a2,a3,b3,b6}_sp +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1a2_sp exp/chain/tdnn_lstm1a3_sp exp/chain/tdnn_lstm1b3_sp exp/chain/tdnn_lstm1b6_sp +# System tdnn_lstm1a2_sp tdnn_lstm1a3_sp tdnn_lstm1b3_sp tdnn_lstm1b6_sp +#WER dev93 (tgpr) 7.47 7.65 7.26 7.32 +#WER dev93 (tg) 7.29 7.24 6.96 6.98 +#WER dev93 (big-dict,tgpr) 5.44 5.60 5.43 5.22 +#WER dev93 (big-dict,fg) 4.98 5.04 4.97 4.86 +#WER eval92 (tgpr) 5.78 5.21 5.30 5.14 +#WER eval92 (tg) 5.44 5.00 4.87 4.82 +#WER eval92 (big-dict,tgpr) 3.35 3.23 3.42 3.24 +#WER eval92 (big-dict,fg) 2.99 2.96 3.03 2.82 +# Final train prob -0.0447 -0.0410 -0.0484 -0.0503 +# Final valid prob -0.0566 -0.0518 -0.0594 -0.0599 +# Final train prob (xent) -0.6859 -0.6676 -0.7528 -0.7415 +# Final valid prob (xent) -0.7378 -0.7230 -0.8078 -0.7804 +# Num-params 9106252 9106252 11747916 11746380 + +# 1b3 is as 1a2 but with the same change as in a->b, replacing lstmp with lstmb +# 1a2 is as 1a but adding l2 regularization. + +# this is a TDNN+LSTM chain system. +# It was modified from local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh with +# reference to ../../tedlium/s5_r2/local/chain/run_tdnn_lstm_1e.sh. +# Note: we're using the same hidden-layer sizes as +# ../../tedlium/s5_r2/local/chain/run_tdnn_lstm_1e.sh despite the +# fact that we'd normally choose a smaller model for a setup with +# less data, because the Tedlium model was probably on the small side. +# Note: we normally use more parameters for LSTM-containing than TDNN-only +# systems. + +# steps/info/chain_dir_info.pl exp/chain/tdnn_lstm1a_sp +# exp/chain/tdnn_lstm1a_sp: num-iters=120 nj=2..10 num-params=9.1M dim=40+100->2889 combine=-0.047->-0.045 xent:train/valid[79,119,final]=(-0.684,-0.569,-0.564/-0.742,-0.668,-0.665) logprob:train/valid[79,119,final]=(-0.045,-0.035,-0.034/-0.058,-0.051,-0.051) + +# The following compares: +# (nnet3 TDNN+LSTM, chain TDNN, this experiment == chain TDNN+LSTM) +# system. 
+# This is consistently better than the nnet3 TDNN+LSTM, but the +# difference with the chain TDNN is inconsistent. + +# local/chain/compare_wer.sh --online exp/nnet3/tdnn_lstm1a_sp exp/chain/tdnn1a_sp exp/chain/tdnn_lstm1a_sp +# System tdnn_lstm1a_sp tdnn1a_sp tdnn_lstm1a_sp +#WER dev93 (tgpr) 8.54 7.87 7.48 +# [online:] 8.57 8.02 7.49 +#WER dev93 (tg) 8.25 7.61 7.41 +# [online:] 8.34 7.70 7.40 +#WER dev93 (big-dict,tgpr) 6.24 5.71 5.64 +# [online:] 6.40 5.60 5.70 +#WER dev93 (big-dict,fg) 5.70 5.10 5.40 +# [online:] 5.77 5.21 5.19 +#WER eval92 (tgpr) 6.52 5.23 5.67 +# [online:] 6.56 5.44 5.60 +#WER eval92 (tg) 6.13 4.87 5.46 +# [online:] 6.24 4.87 5.53 +#WER eval92 (big-dict,tgpr) 3.88 3.24 3.69 +# [online:] 3.88 3.31 3.63 +#WER eval92 (big-dict,fg) 3.38 2.71 3.28 +# [online:] 3.53 2.92 3.31 +# Final train prob -0.0414 -0.0341 +# Final valid prob -0.0634 -0.0506 +# Final train prob (xent) -0.8216 -0.5643 +# Final valid prob (xent) -0.9208 -0.6648 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1b22c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +label_delay=8 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +chunk_left_context=40 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. 
+echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + tdnn_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005 bottleneck-dim=256" + lstm_opts="l2-regularize=0.005 self-scale=2.0" + + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda delay=5 input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, 
so no splicing here + relu-batchnorm-layer name=tdnn1 $tdnn_opts dim=448 + relu-batchnorm-layer name=tdnn2 $tdnn_opts dim=448 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $tdnn_opts dim=448 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 $tdnn_opts dim=448 input=Append(-3,0,3) + lstmb-layer name=lstm3 $lstm_opts cell-dim=384 bottleneck-dim=256 decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 $output_opts output-delay=$label_delay include-log-softmax=false dim=$num_targets + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 $output_opts output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0005 \ + --trainer.optimization.final-effective-lrate=0.00005 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l (s) < computation_.submatrices.size() && s>0); + int32 ans = computation_.commands.size(); + std::vector variable_indexes; + analyzer_.variables.AppendVariablesForSubmatrix(s, &variable_indexes); + std::vector::const_iterator iter = variable_indexes.begin(), + end = variable_indexes.end(); + for (; iter != end; ++iter) { + int32 v = *iter; + const std::vector &accesses = analyzer_.variable_accesses[v]; + if (!accesses.empty()) + ans = std::min(ans, accesses[0].command_index); + } + return ans; +} + + int32 ComputationAnalysis::FirstNontrivialMatrixAccess(int32 m) const { KALDI_ASSERT(static_cast(m) < computation_.matrices.size() && m > 0); int32 ans = computation_.commands.size(); diff --git a/src/nnet3/nnet-analyze.h b/src/nnet3/nnet-analyze.h index 259a4546d53..a82cd4cb5b1 100644 --- a/src/nnet3/nnet-analyze.h +++ b/src/nnet3/nnet-analyze.h @@ -321,6 +321,13 @@ class ComputationAnalysis { /// s must be >0 (i.e. not the empty submatrix). int32 FirstNontrivialAccess(int32 s) const; + /// Returns the first command (read or write) that accesses any part of 's', + /// including possibly zeroing it. 
[note: kAllocMatrix, kSwapMatrix and + /// kDeallocMatrix do not count as read or write operations]. If there is no + /// such command, it returns num_commands. s must be >0 (i.e. not the empty + /// submatrix). + int32 FirstAccess(int32 s) const; + /// Returns the last non-deallocation command that accesses any part of /// submatrix 's'; if there is no such command it returns -1. /// s must be >0 (i.e. not the empty submatrix). @@ -385,7 +392,7 @@ struct CheckComputationOptions { // legitimately fail after optimization. see code for details. bool check_rewrite; // If 'check_unused_variables' is true, it checks for unused variables - // (e.g. unused partsof matrices). We only set it false for online + // (e.g. unused parts of matrices). We only set it false for online // computations, where there can be instances where a part of a matrix is // apparently never accessed (until we consider that the matrix is swapped // with another). diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index ded700dbbd8..2a0b2dcd499 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -3464,13 +3464,12 @@ class ComputationLoopedOptimizer { /// expected to be command indexes of the kNoOperationMarker at segment /// boundaries, this function outputs for each of these command indexes a list /// of matrices which are 'active' at that point in time. By 'active' we mean - /// that the matrix has been written to before that time (note, we don't count - /// initialization with zeros as being written to); and will be read after - /// that time. These is the list of matrices that 'need to be in scope' - /// at those points in time. '*active_matrices' is indexed by the - /// same index as 'splice_point_commands', and is then a list of active - /// matrices, in numerical order of matrix index. - /// Note: for each i, (*active_matrices)[i] will be sorted and unique. 
+ /// that the matrix has been written to before that time (including zeroing), + /// and will be read after that time. These is the list of matrices that + /// 'need to be in scope' at those points in time. '*active_matrices' is + /// indexed by the same index as 'splice_point_commands', and is then a list + /// of active matrices, in numerical order of matrix index. Note: for each i, + /// (*active_matrices)[i] will be sorted and unique. static void FindActiveMatrices(const NnetComputation &computation, const Analyzer &analyzer, const std::vector &splice_point_commands, diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index e12cb7b1c42..0eb5de2c4fc 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -543,7 +543,9 @@ void Optimize(const NnetOptimizeOptions &config, CheckComputation(nnet, *computation, false); } - if (config.optimize && config.move_sizing_commands) { + + if ((config.optimize && config.move_sizing_commands) || + config.optimize_looped_computation) { MoveSizingCommands(nnet, computation); if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); From fd996017b8c9d4464f345ab56bc879b5ca25cb7a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 16 Jan 2018 22:28:33 -0500 Subject: [PATCH 069/184] [egs] Fix rt03 numbers to include swbd (thx: Gaofeng) --- .../s5c/local/chain/compare_wer_general.sh | 6 +- .../s5c/local/chain/tuning/run_tdnn_7m19b.sh | 368 ++++++++++++++++++ .../s5c/local/chain/tuning/run_tdnn_7m19c.sh | 19 +- 3 files changed, 374 insertions(+), 19 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19b.sh diff --git a/egs/swbd/s5c/local/chain/compare_wer_general.sh b/egs/swbd/s5c/local/chain/compare_wer_general.sh index fcd66d5d78d..6412a46e86a 100755 --- a/egs/swbd/s5c/local/chain/compare_wer_general.sh +++ b/egs/swbd/s5c/local/chain/compare_wer_general.sh @@ -132,7 +132,7 @@ if $include_rt03; then echo -n "# WER on rt03(tg) " for x in $*; do set_names $x 
- wer=$(grep Sum $dirname/decode_rt03*sw1_tg$epoch_suffix/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + wer=$(grep Sum $dirname/decode_rt03*sw1_tg$epoch_suffix/score*/rt03_hires.ctm.filt.sys | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo @@ -141,7 +141,7 @@ if $include_rt03; then echo -n "# [looped:] " for x in $*; do set_names $x - wer=$(grep Sum $dirname/decode_rt03*sw1_tg${epoch_suffix}_looped/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + wer=$(grep Sum $dirname/decode_rt03*sw1_tg${epoch_suffix}_looped/score*/rt03_hires.ctm.filt.sys | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo @@ -150,7 +150,7 @@ if $include_rt03; then echo -n "# WER on rt03(fg) " for x in $*; do set_names $x - wer=$(grep Sum $dirname/decode_rt03*sw1_fsh_fg$epoch_suffix/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + wer=$(grep Sum $dirname/decode_rt03*sw1_fsh_fg$epoch_suffix/score*/rt03_hires.ctm.filt.sys | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19b.sh new file mode 100755 index 00000000000..aaebe038e99 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19b.sh @@ -0,0 +1,368 @@ +#!/bin/bash +# TODO: this will be moved before merging to master. + +# 7m19b is as 7m19 but with some bypass connections. Helpful. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 19.1 19.0 +# WER on rt03(fg) 16.6 16.4 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. +# Seems a little better but could be due to the increase in parameters. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp +# System tdnn7m8_sp tdnn7m9_sp +# WER on train_dev(tg) 13.60 13.88 +# WER on train_dev(fg) 12.62 12.64 +# WER on eval2000(tg) 16.8 16.1 +# WER on eval2000(fg) 15.4 14.4 +# WER on rt03(tg) 16.2 15.5 +# WER on rt03(fg) 13.7 13.1 +# Final train prob -0.105 -0.111 +# Final valid prob -0.115 -0.119 +# Final train prob (xent) -1.282 -1.309 +# Final valid prob (xent) -1.3194 -1.3246 +# Num-parameters 11580452 13818148 + +# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which +# is the same as 7m2->7m3, which was helpful there. +# Does seem helpful. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# WER on train_dev(tg) 13.70 13.74 13.81 13.60 +# WER on train_dev(fg) 12.67 12.76 12.74 12.62 +# WER on eval2000(tg) 16.6 17.1 17.0 16.8 +# WER on eval2000(fg) 15.1 15.4 15.4 15.4 +# WER on rt03(tg) 16.1 16.2 16.0 16.2 +# WER on rt03(fg) 13.7 13.8 13.6 13.7 +# Final train prob -0.085 -0.106 -0.104 -0.105 +# Final valid prob -0.103 -0.118 -0.116 -0.115 +# Final train prob (xent) -1.230 -1.296 -1.285 -1.282 +# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194 +# Num-parameters 16292693 10924836 11580452 11580452 + + +# 7m5b is as 7m5 but rducing the prefinal layer dims to previous values. +# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average, +# but not clear at all... for consistency with other setups I may retain +# this change. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# WER on train_dev(tg) 13.70 13.74 13.71 13.81 +# WER on train_dev(fg) 12.67 12.76 12.64 12.74 +# WER on eval2000(tg) 16.6 17.1 16.8 17.0 +# WER on eval2000(fg) 15.1 15.4 15.1 15.4 +# WER on rt03(tg) 16.1 16.2 16.2 16.0 +# WER on rt03(fg) 13.7 13.8 13.8 13.6 +# Final train prob -0.085 -0.106 -0.103 -0.104 +# Final valid prob -0.103 -0.118 -0.114 -0.116 +# Final train prob (xent) -1.230 -1.296 -1.274 -1.285 +# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283 +# Num-parameters 16292693 10924836 12170788 11580452 + + +# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer +# and the prefinal layers from 512 to 768. +# 7m2 is as 7m but with a bunch of tuning changes (model is smaller). +# 7m is as 7k but adding two non-splicing layers towards the beginning of the +# network. 
+# The impovement is pretty small but I've seen similar improvements on other +# setups with this architecture so I tend to believe it. + + +# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp +# System tdnn_7k_sp tdnn_7m_sp +# WER on train_dev(tg) 13.83 13.65 +# WER on train_dev(fg) 12.74 12.54 +# WER on eval2000(tg) 16.9 16.8 +# WER on eval2000(fg) 15.2 15.1 +# Final train prob -0.085 -0.084 +# Final valid prob -0.107 -0.103 +# Final train prob (xent) -1.267 -1.215 +# Final valid prob (xent) -1.3107 -1.2735 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp +# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7m19b +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 bottleneck-dim=192 + relu-batchnorm-layer name=tdnn2 $opts input=Append(-1,0,1) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn3 $opts dim=1280 bottleneck-dim=192 + relu-batchnorm-layer name=tdnn4 $opts input=Append(-1,0,1) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn5 $opts dim=1280 bottleneck-dim=192 + relu-batchnorm-layer name=tdnn6 $opts input=Append(-3,0,3,tdnn4) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn7 $opts input=Append(-3,0,3) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn8 $opts input=Append(-3,0,3,tdnn6) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn9 $opts input=Append(-3,0,3) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn10 $opts input=Append(-3,0,3,tdnn8) dim=1280 bottleneck-dim=256 + + relu-batchnorm-layer name=prefinal-chain input=tdnn10 $opts dim=1280 bottleneck-dim=256 + output-layer 
name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=tdnn10 $opts dim=1280 bottleneck-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19c.sh index 8cc029744a1..5fe29ac3562 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19c.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19c.sh @@ -2,7 +2,7 @@ # Note: before merging to master, this will be renamed. # 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). -# Seems about 0.1% better. +# Effect is unclear. 
# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp # System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp @@ -10,27 +10,14 @@ # WER on train_dev(fg) 12.12 11.87 11.82 # WER on eval2000(tg) 15.8 15.6 15.4 # WER on eval2000(fg) 14.3 14.0 13.8 -# WER on rt03(tg) 14.8 14.9 14.8 -# WER on rt03(fg) 12.4 12.5 12.5 +# WER on rt03(tg) 19.1 19.0 19.1 +# WER on rt03(fg) 16.6 16.4 16.6 # Final train prob -0.096 -0.096 -0.094 # Final valid prob -0.106 -0.106 -0.103 # Final train prob (xent) -1.198 -1.188 -1.117 # Final valid prob (xent) -1.2070 -1.1980 -1.1223 # Num-parameters 15528996 16512036 17824036 -# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp -# System tdnn7m19_sp tdnn7m19b_sp -# WER on train_dev(tg) 13.09 12.93 -# WER on train_dev(fg) 12.12 11.87 -# WER on eval2000(tg) 15.8 15.6 -# WER on eval2000(fg) 14.3 14.0 -# WER on rt03(tg) 14.8 14.9 -# WER on rt03(fg) 12.4 12.5 -# Final train prob -0.096 -0.096 -# Final valid prob -0.106 -0.106 -# Final train prob (xent) -1.198 -1.188 -# Final valid prob (xent) -1.2070 -1.1980 -# Num-parameters 15528996 16512036 # 7m19 is as 7m16 but adding an extra -3,0,3 layer. # CAUTION: messing with queue opts. From b7691d466283fa733bdeeb2006300fd67e4b4c9d Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 18 Jan 2018 19:12:27 -0500 Subject: [PATCH 070/184] [egs] Add new tuning script (not in its final place) --- .../s5c/local/chain/tuning/run_tdnn_7m19b.sh | 26 +- .../s5c/local/chain/tuning/run_tdnn_7m19h.sh | 428 ++++++++++++++++++ 2 files changed, 441 insertions(+), 13 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19h.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19b.sh index aaebe038e99..fdc4b63d59b 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19b.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19b.sh @@ -3,19 +3,19 @@ # 7m19b is as 7m19 but with some bypass connections. Helpful. 
-# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp -# System tdnn7m19_sp tdnn7m19b_sp -# WER on train_dev(tg) 13.09 12.93 -# WER on train_dev(fg) 12.12 11.87 -# WER on eval2000(tg) 15.8 15.6 -# WER on eval2000(fg) 14.3 14.0 -# WER on rt03(tg) 19.1 19.0 -# WER on rt03(fg) 16.6 16.4 -# Final train prob -0.096 -0.096 -# Final valid prob -0.106 -0.106 -# Final train prob (xent) -1.198 -1.188 -# Final valid prob (xent) -1.2070 -1.1980 -# Num-parameters 15528996 16512036 +# local/chain/compare_wer_general.sh --rt03 tdnn7m10_sp tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m10_sp tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.77 13.09 12.93 +# WER on train_dev(fg) 12.65 12.12 11.87 +# WER on eval2000(tg) 16.1 15.8 15.6 +# WER on eval2000(fg) 14.3 14.3 14.0 +# WER on rt03(tg) 19.9 19.1 19.0 +# WER on rt03(fg) 17.4 16.6 16.4 +# Final train prob -0.111 -0.096 -0.096 +# Final valid prob -0.120 -0.106 -0.106 +# Final train prob (xent) -1.314 -1.198 -1.188 +# Final valid prob (xent) -1.3247 -1.2070 -1.1980 +# Num-parameters 13361700 15528996 16512036 # 7m19 is as 7m16 but adding an extra -3,0,3 layer. diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19h.sh new file mode 100755 index 00000000000..b509517da68 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19h.sh @@ -0,0 +1,428 @@ +#!/bin/bash + +# 7m19h is as 7m19e but with an extra bypass connection. A bit better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19e_sp tdnn7m19h_sp +# System tdnn7m19e_sp tdnn7m19h_sp +# WER on train_dev(tg) 12.75 12.65 +# WER on train_dev(fg) 11.77 11.57 +# WER on eval2000(tg) 15.5 15.3 +# WER on eval2000(fg) 14.0 13.7 +# WER on rt03(tg) 18.9 18.8 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.092 -0.091 +# Final valid prob -0.102 -0.102 +# Final train prob (xent) -1.094 -1.091 +# Final valid prob (xent) -1.1095 -1.1064 +# Num-parameters 20760100 21055012 + +# 7m19e is as 7m19c,d but with dims increased to 1536. Better! + +# local/chain/compare_wer_general.sh --rt03 tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# System tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# WER on train_dev(tg) 13.77 12.86 13.01 12.75 +# WER on train_dev(fg) 12.65 11.82 12.02 11.77 +# WER on eval2000(tg) 16.1 15.4 15.7 15.5 +# WER on eval2000(fg) 14.3 13.8 14.0 14.0 +# WER on rt03(tg) 19.9 19.1 19.2 18.9 +# WER on rt03(fg) 17.4 16.6 16.7 16.4 +# Final train prob -0.111 -0.094 -0.096 -0.092 +# Final valid prob -0.120 -0.103 -0.105 -0.102 +# Final train prob (xent) -1.314 -1.117 -1.144 -1.094 +# Final valid prob (xent) -1.3247 -1.1223 -1.1478 -1.1095 +# Num-parameters 13361700 17824036 14887972 20760100 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# System tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# WER on train_dev(tg) 13.37 13.09 12.93 12.86 13.01 +# WER on train_dev(fg) 12.47 12.12 11.87 11.82 12.02 +# WER on eval2000(tg) 15.8 15.8 15.6 15.4 15.7 +# WER on eval2000(fg) 14.3 14.3 14.0 13.8 14.0 +# WER on rt03(tg) 15.1 14.8 14.9 14.8 14.9 +# WER on rt03(fg) 12.7 12.4 12.5 12.5 12.6 +# Final train prob -0.099 -0.096 -0.096 -0.094 -0.096 +# Final valid prob -0.110 -0.106 -0.106 -0.103 -0.105 +# Final train prob (xent) -1.302 -1.198 -1.188 -1.117 -1.144 +# Final valid prob (xent) -1.3184 -1.2070 -1.1980 -1.1223 -1.1478 +# Num-parameters 14216996 15528996 
16512036 17824036 14887972 + +# 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). +# Seems about 0.1% better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# WER on train_dev(tg) 13.09 12.93 12.86 +# WER on train_dev(fg) 12.12 11.87 11.82 +# WER on eval2000(tg) 15.8 15.6 15.4 +# WER on eval2000(fg) 14.3 14.0 13.8 +# WER on rt03(tg) 14.8 14.9 14.8 +# WER on rt03(fg) 12.4 12.5 12.5 +# Final train prob -0.096 -0.096 -0.094 +# Final valid prob -0.106 -0.106 -0.103 +# Final train prob (xent) -1.198 -1.188 -1.117 +# Final valid prob (xent) -1.2070 -1.1980 -1.1223 +# Num-parameters 15528996 16512036 17824036 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 14.8 14.9 +# WER on rt03(fg) 12.4 12.5 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. +# Seems a little better but could be due to the increase in parameters. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. 
+
+# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp
+# System tdnn7m8_sp tdnn7m9_sp
+# WER on train_dev(tg) 13.60 13.88
+# WER on train_dev(fg) 12.62 12.64
+# WER on eval2000(tg) 16.8 16.1
+# WER on eval2000(fg) 15.4 14.4
+# WER on rt03(tg) 16.2 15.5
+# WER on rt03(fg) 13.7 13.1
+# Final train prob -0.105 -0.111
+# Final valid prob -0.115 -0.119
+# Final train prob (xent) -1.282 -1.309
+# Final valid prob (xent) -1.3194 -1.3246
+# Num-parameters 11580452 13818148
+
+# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which
+# is the same as 7m2->7m3, which was helpful there.
+# Does seem helpful.
+
+# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp
+# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp
+# WER on train_dev(tg) 13.70 13.74 13.81 13.60
+# WER on train_dev(fg) 12.67 12.76 12.74 12.62
+# WER on eval2000(tg) 16.6 17.1 17.0 16.8
+# WER on eval2000(fg) 15.1 15.4 15.4 15.4
+# WER on rt03(tg) 16.1 16.2 16.0 16.2
+# WER on rt03(fg) 13.7 13.8 13.6 13.7
+# Final train prob -0.085 -0.106 -0.104 -0.105
+# Final valid prob -0.103 -0.118 -0.116 -0.115
+# Final train prob (xent) -1.230 -1.296 -1.285 -1.282
+# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194
+# Num-parameters 16292693 10924836 11580452 11580452
+
+
+# 7m5b is as 7m5 but reducing the prefinal layer dims to previous values.
+# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average,
+# but not clear at all... for consistency with other setups I may retain
+# this change.
+
+# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp
+# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp
+# WER on train_dev(tg) 13.70 13.74 13.71 13.81
+# WER on train_dev(fg) 12.67 12.76 12.64 12.74
+# WER on eval2000(tg) 16.6 17.1 16.8 17.0
+# WER on eval2000(fg) 15.1 15.4 15.1 15.4
+# WER on rt03(tg) 16.1 16.2 16.2 16.0
+# WER on rt03(fg) 13.7 13.8 13.8 13.6
+# Final train prob -0.085 -0.106 -0.103 -0.104
+# Final valid prob -0.103 -0.118 -0.114 -0.116
+# Final train prob (xent) -1.230 -1.296 -1.274 -1.285
+# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283
+# Num-parameters 16292693 10924836 12170788 11580452
+
+
+# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer
+# and the prefinal layers from 512 to 768.
+# 7m2 is as 7m but with a bunch of tuning changes (model is smaller).
+# 7m is as 7k but adding two non-splicing layers towards the beginning of the
+# network.
+# The improvement is pretty small but I've seen similar improvements on other
+# setups with this architecture so I tend to believe it.
+ + +# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp +# System tdnn_7k_sp tdnn_7m_sp +# WER on train_dev(tg) 13.83 13.65 +# WER on train_dev(fg) 12.74 12.54 +# WER on eval2000(tg) 16.9 16.8 +# WER on eval2000(fg) 15.2 15.1 +# Final train prob -0.085 -0.084 +# Final valid prob -0.107 -0.103 +# Final train prob (xent) -1.267 -1.215 +# Final valid prob (xent) -1.3107 -1.2735 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp +# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7m19h +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1536 bottleneck-dim=192 + relu-batchnorm-layer name=tdnn2 $opts input=Append(-1,0,1) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn3 $opts dim=1536 bottleneck-dim=192 + relu-batchnorm-layer name=tdnn4 $opts input=Append(-1,0,1) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn5 $opts dim=1536 input=Append(tdnn4, tdnn2) bottleneck-dim=192 + relu-batchnorm-layer name=tdnn6 $opts input=Append(-3,0,3) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn7 $opts input=Append(-3,0,3,tdnn5) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn8 $opts input=Append(-3,0,3) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn9 $opts input=Append(-3,0,3,tdnn7) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn10 $opts input=Append(-3,0,3) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn11 $opts input=Append(-3,0,3,tdnn9) dim=1536 
bottleneck-dim=256 + + relu-batchnorm-layer name=prefinal-chain input=tdnn11 $opts dim=1536 bottleneck-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=tdnn11 $opts dim=1536 bottleneck-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is 
concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; From 85a2c60e3697e54711e33fbff854f218597745c7 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sat, 20 Jan 2018 21:34:27 +0330 Subject: [PATCH 071/184] [src] Print informative error if num-ceps >= num-mel-bins in MFCC (#2166) --- src/feat/feature-mfcc.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/feat/feature-mfcc.cc b/src/feat/feature-mfcc.cc index c1962a5c1d1..122ba1b100d 100644 --- a/src/feat/feature-mfcc.cc +++ b/src/feat/feature-mfcc.cc @@ -82,7 +82,14 @@ void MfccComputer::Compute(BaseFloat signal_log_energy, MfccComputer::MfccComputer(const MfccOptions &opts): opts_(opts), srfft_(NULL), mel_energies_(opts.mel_opts.num_bins) { + int32 num_bins = opts.mel_opts.num_bins; + if (opts.num_ceps > num_bins) + KALDI_ERR << "num-ceps cannot be larger than num-mel-bins." + << " It should be smaller or equal. You provided num-ceps: " + << opts.num_ceps << " and num-mel-bins: " + << num_bins; + Matrix dct_matrix(num_bins, num_bins); ComputeDctMatrix(&dct_matrix); // Note that we include zeroth dct in either case. 
If using the From 35950ea2461f63e7de9423456c13abb22a396ac7 Mon Sep 17 00:00:00 2001 From: Xingyu Na Date: Mon, 22 Jan 2018 02:50:14 +0800 Subject: [PATCH 072/184] [egs] add speaker recognition recipe for aishell (#2111) --- egs/aishell/README.txt | 4 + egs/aishell/v1/README.txt | 8 ++ egs/aishell/v1/cmd.sh | 15 +++ egs/aishell/v1/conf/mfcc.conf | 3 + egs/aishell/v1/conf/vad.conf | 2 + egs/aishell/v1/local/aishell_data_prep.sh | 63 +++++++++++ egs/aishell/v1/local/download_and_untar.sh | 105 ++++++++++++++++++ egs/aishell/v1/local/produce_trials.py | 35 ++++++ .../v1/local/split_data_enroll_eval.py | 37 ++++++ egs/aishell/v1/path.sh | 5 + egs/aishell/v1/run.sh | 94 ++++++++++++++++ egs/aishell/v1/sid | 1 + egs/aishell/v1/steps | 1 + egs/aishell/v1/utils | 1 + 14 files changed, 374 insertions(+) create mode 100644 egs/aishell/v1/README.txt create mode 100644 egs/aishell/v1/cmd.sh create mode 100644 egs/aishell/v1/conf/mfcc.conf create mode 100644 egs/aishell/v1/conf/vad.conf create mode 100755 egs/aishell/v1/local/aishell_data_prep.sh create mode 100755 egs/aishell/v1/local/download_and_untar.sh create mode 100755 egs/aishell/v1/local/produce_trials.py create mode 100755 egs/aishell/v1/local/split_data_enroll_eval.py create mode 100755 egs/aishell/v1/path.sh create mode 100755 egs/aishell/v1/run.sh create mode 120000 egs/aishell/v1/sid create mode 120000 egs/aishell/v1/steps create mode 120000 egs/aishell/v1/utils diff --git a/egs/aishell/README.txt b/egs/aishell/README.txt index 0dcea0977cc..f37e6dfaeb5 100644 --- a/egs/aishell/README.txt +++ b/egs/aishell/README.txt @@ -5,5 +5,9 @@ Aishell is an open Chinese Mandarin speech database published by Beijing Shell S The database can be downloaded from openslr: http://www.openslr.org/33/ +This folder contains two subfolders: +s5: a speech recognition recipe +v1: a speaker recognition recipe + For more details, please visit: http://www.aishelltech.com/kysjcp diff --git a/egs/aishell/v1/README.txt 
b/egs/aishell/v1/README.txt new file mode 100644 index 00000000000..43e26d0817d --- /dev/null +++ b/egs/aishell/v1/README.txt @@ -0,0 +1,8 @@ +The database can be downloaded from openslr: +http://www.openslr.org/33/ + +For more details, please visit: +http://www.aishelltech.com/kysjcp + +We use the training set to train model, +and split the test set into enroll and eval. diff --git a/egs/aishell/v1/cmd.sh b/egs/aishell/v1/cmd.sh new file mode 100644 index 00000000000..d1ca1a6d126 --- /dev/null +++ b/egs/aishell/v1/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" + + diff --git a/egs/aishell/v1/conf/mfcc.conf b/egs/aishell/v1/conf/mfcc.conf new file mode 100644 index 00000000000..f40379a0803 --- /dev/null +++ b/egs/aishell/v1/conf/mfcc.conf @@ -0,0 +1,3 @@ +--sample-frequency=16000 +--num-mel-bins=40 #higher than the default which is 23 +--num-ceps=20 # higher than the default which is 12. 
diff --git a/egs/aishell/v1/conf/vad.conf b/egs/aishell/v1/conf/vad.conf new file mode 100644 index 00000000000..a0ca2449b10 --- /dev/null +++ b/egs/aishell/v1/conf/vad.conf @@ -0,0 +1,2 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 diff --git a/egs/aishell/v1/local/aishell_data_prep.sh b/egs/aishell/v1/local/aishell_data_prep.sh new file mode 100755 index 00000000000..70d6ba1f3e5 --- /dev/null +++ b/egs/aishell/v1/local/aishell_data_prep.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +# Copyright 2017 Xingyu Na +# Apache 2.0 + +. ./path.sh || exit 1; + +if [ $# != 2 ]; then + echo "Usage: $0 " + echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript" + exit 1; +fi + +aishell_audio_dir=$1 +aishell_text_dir=$2 + +train_dir=data/local/train +dev_dir=data/local/dev +test_dir=data/local/test + +mkdir -p $train_dir +mkdir -p $dev_dir +mkdir -p $test_dir + +# data directory check +if [ ! -d $aishell_audio_dir ] || [ ! -d $aishell_text_dir ]; then + echo "Error: $0 requires two directory arguments" + exit 1; +fi + +# find wav audio file for train, dev and test resp. 
+find $aishell_audio_dir -iname "*.wav" | grep -i "wav/train" > $train_dir/wav.flist || exit 1; +find $aishell_audio_dir -iname "*.wav" | grep -i "wav/dev" > $dev_dir/wav.flist || exit 1; +find $aishell_audio_dir -iname "*.wav" | grep -i "wav/test" > $test_dir/wav.flist || exit 1; + +n=`cat $train_dir/wav.flist $dev_dir/wav.flist $test_dir/wav.flist | wc -l` +[ $n -ne 141925 ] && \ + echo Warning: expected 141925 data data files, found $n + +# Transcriptions preparation +for dir in $train_dir $test_dir; do + echo Preparing $dir transcriptions + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' |\ + sort > $dir/utt.list + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' |\ + sort > $dir/utt2spk_all + paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all + utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text_dir/*.txt > $dir/transcripts.txt + awk '{print $1}' $dir/transcripts.txt > $dir/utt.list + utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk + utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp + sort -u $dir/transcripts.txt > $dir/text + utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt +done + +mkdir -p data/train data/test +for f in spk2utt utt2spk wav.scp text; do + cp $train_dir/$f data/train/$f || exit 1; + cp $test_dir/$f data/test/$f || exit 1; +done + +echo "$0: AISHELL data preparation succeeded" +exit 0; diff --git a/egs/aishell/v1/local/download_and_untar.sh b/egs/aishell/v1/local/download_and_untar.sh new file mode 100755 index 00000000000..0189bad1d4a --- /dev/null +++ b/egs/aishell/v1/local/download_and_untar.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (author: Daniel Povey) +# 2017 Xingyu Na +# Apache 2.0 + +remove_archive=false + +if [ "$1" == --remove-archive ]; then + remove_archive=true + shift +fi + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--remove-archive] " + echo "e.g.: $0 
/export/a05/xna/data www.openslr.org/resources/33 data_aishell" + echo "With --remove-archive it will remove the archive after successfully un-tarring it." + echo " can be one of: data_aishell, resource." +fi + +data=$1 +url=$2 +part=$3 + +if [ ! -d "$data" ]; then + echo "$0: no such directory $data" + exit 1; +fi + +part_ok=false +list="data_aishell resource" +for x in $list; do + if [ "$part" == $x ]; then part_ok=true; fi +done +if ! $part_ok; then + echo "$0: expected to be one of $list, but got '$part'" + exit 1; +fi + +if [ -z "$url" ]; then + echo "$0: empty URL base." + exit 1; +fi + +if [ -f $data/$part/.complete ]; then + echo "$0: data part $part was already successfully extracted, nothing to do." + exit 0; +fi + +# sizes of the archive files in bytes. +sizes="15582913665 1246920" + +if [ -f $data/$part.tgz ]; then + size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') + size_ok=false + for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done + if ! $size_ok; then + echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" + echo "does not equal the size of one of the archives." + rm $data/$part.gz + else + echo "$data/$part.tgz exists and appears to be complete." + fi +fi + +if [ ! -f $data/$part.tgz ]; then + if ! which wget >/dev/null; then + echo "$0: wget is not installed." + exit 1; + fi + full_url=$url/$part.tgz + echo "$0: downloading data from $full_url. This may take some time, please be patient." + + cd $data + if ! wget --no-check-certificate $full_url; then + echo "$0: error executing wget $full_url" + exit 1; + fi +fi + +cd $data + +if ! 
tar -xvzf $part.tgz; then + echo "$0: error un-tarring archive $data/$part.tgz" + exit 1; +fi + +touch $data/$part/.complete + +if [ $part == "data_aishell" ]; then + cd $data/$part/wav + for wav in ./*.tar.gz; do + echo "Extracting wav from $wav" + tar -zxf $wav && rm $wav + done +fi + +echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" + +if $remove_archive; then + echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." + rm $data/$part.tgz +fi + +exit 0; diff --git a/egs/aishell/v1/local/produce_trials.py b/egs/aishell/v1/local/produce_trials.py new file mode 100755 index 00000000000..d01f7eb7da3 --- /dev/null +++ b/egs/aishell/v1/local/produce_trials.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# Copyright 2017 Bengu Wu +# Apache 2.0. + +# This script generate trials file. +# Trial file is formatted as: +# uttid spkid target|nontarget + +# If uttid belong to spkid, it is marked 'target', +# otherwise is 'nontarget'. +# input: eval set uttspk file +# output: trial file + +import sys + +fnutt = sys.argv[1] +ftrial = open(sys.argv[2], 'w') + +dictutt = {} +for line in open(fnutt): + utt2spk = line.rstrip('\r\t\n ') + spk = utt2spk.split(' ')[1] + if spk not in dictutt: + dictutt[spk] = spk + +for line in open(fnutt): + utt2spk = line.rstrip('\r\t\n ') + utt, spk = utt2spk.split(' ') + for target in dictutt: + if target == spk: + trial = utt + ' ' + target + ' target' + else: + trial = utt + ' ' + target + ' nontarget' + ftrial.write(trial + '\n') +ftrial.close() diff --git a/egs/aishell/v1/local/split_data_enroll_eval.py b/egs/aishell/v1/local/split_data_enroll_eval.py new file mode 100755 index 00000000000..7aa45121c17 --- /dev/null +++ b/egs/aishell/v1/local/split_data_enroll_eval.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Bengu Wu +# Apache 2.0. 
+ +# This script splits the test set utt2spk into enroll set and eval set +# For each speaker, 3 utterances are randomly selected as enroll samples, +# and the others are used as eval samples for evaluation +# input: test utt2spk +# output: enroll utt2spk, eval utt2spk + +import sys,random + +dictutt = {} + +for line in open(sys.argv[1]): + line = line.rstrip('\r\t\n ') + utt, spk = line.split(' ') + if spk not in dictutt: + dictutt[spk] = [] + dictutt[spk].append(utt) + +fenroll = open(sys.argv[2], 'w') +feval = open(sys.argv[3], 'w') + +for key in dictutt: + utts = dictutt[key] + random.shuffle(utts) + for i in range(0, len(utts)): + line = utts[i] + ' ' + key + if(i < 3): + fenroll.write(line + '\n') + else: + feval.write(line + '\n') + +fenroll.close() +feval.close() diff --git a/egs/aishell/v1/path.sh b/egs/aishell/v1/path.sh new file mode 100755 index 00000000000..e50f57c5271 --- /dev/null +++ b/egs/aishell/v1/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/aishell/v1/run.sh b/egs/aishell/v1/run.sh new file mode 100755 index 00000000000..0aaa6d493d6 --- /dev/null +++ b/egs/aishell/v1/run.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# Copyright 2017 Beijing Shell Shell Tech. Co. Ltd. (Authors: Hui Bu) +# 2017 Jiayu Du +# 2017 Chao Li +# 2017 Xingyu Na +# 2017 Bengu Wu +# 2017 Hao Zheng +# Apache 2.0 + +# This is a shell script that we demonstrate speech recognition using AIShell-1 data. +# it's recommended that you run the commands one by one by copying and pasting into the shell. +# See README.txt for more info on data required. 
+# Results (EER) are inline in comments below + +data=/export/a05/xna/data +data_url=www.openslr.org/resources/33 + +. ./cmd.sh +. ./path.sh + +set -e # exit on error + +local/download_and_untar.sh $data $data_url data_aishell +local/download_and_untar.sh $data $data_url resource_aishell + +# Data Preparation +local/aishell_data_prep.sh $data/data_aishell/wav $data/data_aishell/transcript + +# Now make MFCC features. +# mfccdir should be some place with a largish disk where you +# want to store MFCC features. +mfccdir=mfcc +for x in train test; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/$x exp/make_mfcc/$x $mfccdir + sid/compute_vad_decision.sh --nj 10 --cmd "$train_cmd" data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x +done + +# train diag ubm +sid/train_diag_ubm.sh --nj 10 --cmd "$train_cmd" --num-threads 16 \ + data/train 1024 exp/diag_ubm_1024 + +#train full ubm +sid/train_full_ubm.sh --nj 10 --cmd "$train_cmd" data/train \ + exp/diag_ubm_1024 exp/full_ubm_1024 + +#train ivector +sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 10G" \ + --num-iters 5 exp/full_ubm_1024/final.ubm data/train \ + exp/extractor_1024 + +#extract ivector +sid/extract_ivectors.sh --cmd "$train_cmd" --nj 10 \ + exp/extractor_1024 data/train exp/ivector_train_1024 + +#train plda +$train_cmd exp/ivector_train_1024/log/plda.log \ + ivector-compute-plda ark:data/train/spk2utt \ + 'ark:ivector-normalize-length scp:exp/ivector_train_1024/ivector.scp ark:- |' \ + exp/ivector_train_1024/plda + +#split the test to enroll and eval +mkdir -p data/test/enroll data/test/eval +cp data/test/{spk2utt,feats.scp,vad.scp} data/test/enroll +cp data/test/{spk2utt,feats.scp,vad.scp} data/test/eval +local/split_data_enroll_eval.py data/test/utt2spk data/test/enroll/utt2spk data/test/eval/utt2spk +trials=data/test/aishell_speaker_ver.lst +local/produce_trials.py data/test/eval/utt2spk $trials +utils/fix_data_dir.sh data/test/enroll +utils/fix_data_dir.sh data/test/eval + 
+#extract enroll ivector +sid/extract_ivectors.sh --cmd "$train_cmd" --nj 10 \ + exp/extractor_1024 data/test/enroll exp/ivector_enroll_1024 +#extract eval ivector +sid/extract_ivectors.sh --cmd "$train_cmd" --nj 10 \ + exp/extractor_1024 data/test/eval exp/ivector_eval_1024 + +#compute plda score +$train_cmd exp/ivector_eval_1024/log/plda_score.log \ + ivector-plda-scoring --num-utts=ark:exp/ivector_enroll_1024/num_utts.ark \ + exp/ivector_train_1024/plda \ + ark:exp/ivector_enroll_1024/spk_ivector.ark \ + "ark:ivector-normalize-length scp:exp/ivector_eval_1024/ivector.scp ark:- |" \ + "cat '$trials' | awk '{print \\\$2, \\\$1}' |" exp/trials_out + +#compute eer +awk '{print $3}' exp/trials_out | paste - $trials | awk '{print $1, $4}' | compute-eer - + +# Result +# Scoring against data/test/aishell_speaker_ver.lst +# Equal error rate is 0.140528%, at threshold -12.018 + +exit 0 diff --git a/egs/aishell/v1/sid b/egs/aishell/v1/sid new file mode 120000 index 00000000000..893a12f30c9 --- /dev/null +++ b/egs/aishell/v1/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/aishell/v1/steps b/egs/aishell/v1/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/aishell/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/aishell/v1/utils b/egs/aishell/v1/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/aishell/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file From d6391f8640f1f92fe27340e87ad797abfb76da3d Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Mon, 22 Jan 2018 02:12:33 -0500 Subject: [PATCH 073/184] [scripts,egs] Fix bug in slurm.pl (thx:@kamo-naoyuki), remove outdated results. 
--- egs/fisher_swbd/s5/RESULTS | 40 ------------------------------ egs/wsj/s5/utils/parallel/slurm.pl | 4 +-- 2 files changed, 2 insertions(+), 42 deletions(-) diff --git a/egs/fisher_swbd/s5/RESULTS b/egs/fisher_swbd/s5/RESULTS index 27de4757966..14a00465b51 100644 --- a/egs/fisher_swbd/s5/RESULTS +++ b/egs/fisher_swbd/s5/RESULTS @@ -40,46 +40,6 @@ for x in exp/nnet2_online/nnet_ms_a_online/decode_eval2000*_fg; do grep Sum $x/ %WER 12.3 | 1831 21395 | 89.2 7.2 3.5 1.5 12.3 50.8 | exp/nnet2_online/nnet_ms_a_online/decode_eval2000_utt_fsh_sw1_fg/score_13/eval2000.ctm.swbd.filt.sys %WER 11.8 | 1831 21395 | 89.6 7.2 3.2 1.4 11.8 49.0 | exp/nnet2_online/nnet_ms_a_online/decode_eval2000_utt_offline_fsh_sw1_fg/score_11/eval2000.ctm.swbd.filt.sys -# nnet3 result on eval2000 -# BLSTM ran for about 760 hours, command: -# local/nnet3/run_lstm.sh --affix bidirectional --lstm-delay " [-1,1] [-2,2] [-3,3] " --label-delay 0 \ -# --cell-dim 1024 --recurrent-projection-dim 128 --non-recurrent-projection-dim 128 \ -# --chunk-left-context 40 --chunk-right-context 40 \ -# --extra-left-context 50 --extra-right-context 50 -# use tri-gram -for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done -%WER 15.8 | 4459 42989 | 86.1 9.7 4.1 1.9 15.8 52.6 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys -%WER 14.8 | 4459 42989 | 87.2 9.4 3.4 2.1 14.8 52.2 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_tg_epoch2.adj/score_13_0.0/eval2000_hires.ctm.filt.sys -%WER 14.8 | 4459 42989 | 86.6 9.2 4.3 1.4 14.8 54.3 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys -# rescore with four-gram -for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done -%WER 15.4 | 4459 42989 | 86.4 9.5 4.0 1.8 15.4 51.6 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.filt.sys -%WER 14.5 | 4459 42989 | 87.5 
9.0 3.5 2.0 14.5 51.4 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_fg_epoch2.adj/score_14_0.0/eval2000_hires.ctm.filt.sys -%WER 14.5 | 4459 42989 | 87.0 9.0 4.0 1.5 14.5 53.7 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys - -# nnet3 result on eval2000 for swbd subset -# use tri-gram -for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done -%WER 11.6 | 1831 21395 | 89.7 7.3 3.0 1.3 11.6 47.7 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys -%WER 10.3 | 1831 21395 | 91.0 6.4 2.5 1.3 10.3 45.9 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_tg_epoch2.adj/score_19_0.0/eval2000_hires.ctm.swbd.filt.sys -%WER 10.7 | 1831 21395 | 90.3 6.7 3.0 1.0 10.7 45.9 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys -# rescore with four-gram -for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done -%WER 11.1 | 1831 21395 | 90.2 7.0 2.8 1.3 11.1 46.2 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys -%WER 10.0 | 1831 21395 | 91.3 6.3 2.4 1.3 10.0 45.1 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_fg_epoch2.adj/score_19_1.0/eval2000_hires.ctm.swbd.filt.sys -%WER 10.4 | 1831 21395 | 90.6 6.5 2.9 1.0 10.4 45.3 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys - -# nnet3 result on eval2000 for callhm subset -# use tri-gram -for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.callhm.filt.sys | utils/best_wer.sh ; done -%WER 19.9 | 2628 21594 | 82.6 12.1 5.3 2.6 19.9 56.0 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys -%WER 19.0 | 2628 21594 | 83.5 11.7 4.8 2.5 19.0 56.5 | 
exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_tg_epoch2.adj/score_14_0.5/eval2000_hires.ctm.callhm.filt.sys -%WER 18.8 | 2628 21594 | 83.1 11.7 5.2 1.9 18.8 60.2 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys -# rescore with four-gram -for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done -%WER 19.7 | 2628 21594 | 82.7 12.1 5.2 2.4 19.7 55.3 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys -%WER 18.7 | 2628 21594 | 83.7 11.5 4.8 2.5 18.7 55.6 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_fg_epoch2.adj/score_14_0.0/eval2000_hires.ctm.callhm.filt.sys -%WER 18.6 | 2628 21594 | 83.3 11.5 5.2 1.9 18.6 59.6 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys # chain result on eval2000 # BLSTM ran for about 380 hours diff --git a/egs/wsj/s5/utils/parallel/slurm.pl b/egs/wsj/s5/utils/parallel/slurm.pl index baad329c937..919b32045a2 100755 --- a/egs/wsj/s5/utils/parallel/slurm.pl +++ b/egs/wsj/s5/utils/parallel/slurm.pl @@ -394,7 +394,7 @@ sub exec_command { print Q "if [ \"\$CUDA_VISIBLE_DEVICES\" == \"NoDevFiles\" ]; then\n"; print Q " ( echo CUDA_VISIBLE_DEVICES set to NoDevFiles, unsetting it... \n"; print Q " )>>$logfile\n"; -print Q " unset CUDA_VISIBLE_DEVICES.\n"; +print Q " unset CUDA_VISIBLE_DEVICES\n"; print Q "fi\n"; print Q "time1=\`date +\"%s\"\`\n"; print Q " ( $cmd ) &>>$logfile\n"; @@ -506,7 +506,7 @@ sub exec_command { if ($squeue_status == 1) { # time to make sure it is not just delayed creation of the syncfile. - # Don't consider immediately missing job as error, first wait some + # Don't consider immediately missing job as error, first wait some # time to make sure it is not just delayed creation of the syncfile. 
sleep(4); # Sometimes NFS gets confused and thinks it's transmitted the directory From c3dd60f9aa9b1e126923916c281ff2e31a3a8bc4 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 22 Jan 2018 17:59:09 -0500 Subject: [PATCH 074/184] [egs] Adding some of the more interesting tuning experiments; add xconfig layer for linear-component --- .../s5c/local/chain/tuning/run_tdnn_7m19m.sh | 473 +++++++++++++++++ .../s5c/local/chain/tuning/run_tdnn_7m23b.sh | 482 +++++++++++++++++ .../s5c/local/chain/tuning/run_tdnn_7m23b2.sh | 501 ++++++++++++++++++ egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 3 +- .../libs/nnet3/xconfig/trivial_layers.py | 76 +++ 5 files changed, 1534 insertions(+), 1 deletion(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19m.sh create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23b.sh create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23b2.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19m.sh new file mode 100755 index 00000000000..ed533b7da29 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19m.sh @@ -0,0 +1,473 @@ +#!/bin/bash + +# 7m19m is as 7m19l but with more skip connections +# Hm-- seems better than 19h. +# +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# System tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# WER on train_dev(tg) 12.61 12.72 12.55 +# WER on train_dev(fg) 11.72 11.62 11.52 +# WER on eval2000(tg) 15.4 15.4 15.2 +# WER on eval2000(fg) 13.7 13.8 13.6 +# WER on rt03(tg) 18.9 18.9 18.6 +# WER on rt03(fg) 16.3 16.4 16.2 +# Final train prob -0.091 -0.091 -0.089 +# Final valid prob -0.102 -0.103 -0.101 +# Final train prob (xent) -1.098 -1.095 -1.080 +# Final valid prob (xent) -1.1031 -1.1191 -1.0990 +# Num-parameters 21055012 20268580 21055012 +# +# 7m19l is as 7m19h but projecting down to an intermediate dim (512) before +# doing the Append... 
doing this by inserting a linear-component between +# pairs of relu-batchnorm-layers. +# A little worse. +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp +# System tdnn7m19h_sp tdnn7m19l_sp +# WER on train_dev(tg) 12.65 12.72 +# WER on train_dev(fg) 11.57 11.62 +# WER on eval2000(tg) 15.3 15.4 +# WER on eval2000(fg) 13.7 13.8 +# WER on rt03(tg) 18.8 18.9 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.091 -0.091 +# Final valid prob -0.102 -0.103 +# Final train prob (xent) -1.091 -1.095 +# Final valid prob (xent) -1.1064 -1.1191 +# Num-parameters 21055012 20268580 + + +# 7m19h is as 7m19e but with an extra bypass connection. A bit better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19e_sp tdnn7m19h_sp +# System tdnn7m19e_sp tdnn7m19h_sp +# WER on train_dev(tg) 12.75 12.65 +# WER on train_dev(fg) 11.77 11.57 +# WER on eval2000(tg) 15.5 15.3 +# WER on eval2000(fg) 14.0 13.7 +# WER on rt03(tg) 18.9 18.8 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.092 -0.091 +# Final valid prob -0.102 -0.102 +# Final train prob (xent) -1.094 -1.091 +# Final valid prob (xent) -1.1095 -1.1064 +# Num-parameters 20760100 21055012 + +# 7m19e is as 7m19c,d but with dims increased to 1536. Better! 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# System tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# WER on train_dev(tg) 13.77 12.86 13.01 12.75 +# WER on train_dev(fg) 12.65 11.82 12.02 11.77 +# WER on eval2000(tg) 16.1 15.4 15.7 15.5 +# WER on eval2000(fg) 14.3 13.8 14.0 14.0 +# WER on rt03(tg) 19.9 19.1 19.2 18.9 +# WER on rt03(fg) 17.4 16.6 16.7 16.4 +# Final train prob -0.111 -0.094 -0.096 -0.092 +# Final valid prob -0.120 -0.103 -0.105 -0.102 +# Final train prob (xent) -1.314 -1.117 -1.144 -1.094 +# Final valid prob (xent) -1.3247 -1.1223 -1.1478 -1.1095 +# Num-parameters 13361700 17824036 14887972 20760100 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# System tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# WER on train_dev(tg) 13.37 13.09 12.93 12.86 13.01 +# WER on train_dev(fg) 12.47 12.12 11.87 11.82 12.02 +# WER on eval2000(tg) 15.8 15.8 15.6 15.4 15.7 +# WER on eval2000(fg) 14.3 14.3 14.0 13.8 14.0 +# WER on rt03(tg) 15.1 14.8 14.9 14.8 14.9 +# WER on rt03(fg) 12.7 12.4 12.5 12.5 12.6 +# Final train prob -0.099 -0.096 -0.096 -0.094 -0.096 +# Final valid prob -0.110 -0.106 -0.106 -0.103 -0.105 +# Final train prob (xent) -1.302 -1.198 -1.188 -1.117 -1.144 +# Final valid prob (xent) -1.3184 -1.2070 -1.1980 -1.1223 -1.1478 +# Num-parameters 14216996 15528996 16512036 17824036 14887972 + +# 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). +# Seems about 0.1% better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# WER on train_dev(tg) 13.09 12.93 12.86 +# WER on train_dev(fg) 12.12 11.87 11.82 +# WER on eval2000(tg) 15.8 15.6 15.4 +# WER on eval2000(fg) 14.3 14.0 13.8 +# WER on rt03(tg) 14.8 14.9 14.8 +# WER on rt03(fg) 12.4 12.5 12.5 +# Final train prob -0.096 -0.096 -0.094 +# Final valid prob -0.106 -0.106 -0.103 +# Final train prob (xent) -1.198 -1.188 -1.117 +# Final valid prob (xent) -1.2070 -1.1980 -1.1223 +# Num-parameters 15528996 16512036 17824036 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 14.8 14.9 +# WER on rt03(fg) 12.4 12.5 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. 
+# Seems a little better but could be due to the increase in parameters. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp +# System tdnn7m8_sp tdnn7m9_sp +# WER on train_dev(tg) 13.60 13.88 +# WER on train_dev(fg) 12.62 12.64 +# WER on eval2000(tg) 16.8 16.1 +# WER on eval2000(fg) 15.4 14.4 +# WER on rt03(tg) 16.2 15.5 +# WER on rt03(fg) 13.7 13.1 +# Final train prob -0.105 -0.111 +# Final valid prob -0.115 -0.119 +# Final train prob (xent) -1.282 -1.309 +# Final valid prob (xent) -1.3194 -1.3246 +# Num-parameters 11580452 13818148 + +# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which +# is the same as 7m2->7m3, which was helpful there. +# Does seem helpful. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# WER on train_dev(tg) 13.70 13.74 13.81 13.60 +# WER on train_dev(fg) 12.67 12.76 12.74 12.62 +# WER on eval2000(tg) 16.6 17.1 17.0 16.8 +# WER on eval2000(fg) 15.1 15.4 15.4 15.4 +# WER on rt03(tg) 16.1 16.2 16.0 16.2 +# WER on rt03(fg) 13.7 13.8 13.6 13.7 +# Final train prob -0.085 -0.106 -0.104 -0.105 +# Final valid prob -0.103 -0.118 -0.116 -0.115 +# Final train prob (xent) -1.230 -1.296 -1.285 -1.282 +# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194 +# Num-parameters 16292693 10924836 11580452 11580452 + + +# 7m5b is as 7m5 but rducing the prefinal layer dims to previous values. +# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average, +# but not clear at all... for consistency with other setups I may retain +# this change. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# WER on train_dev(tg) 13.70 13.74 13.71 13.81 +# WER on train_dev(fg) 12.67 12.76 12.64 12.74 +# WER on eval2000(tg) 16.6 17.1 16.8 17.0 +# WER on eval2000(fg) 15.1 15.4 15.1 15.4 +# WER on rt03(tg) 16.1 16.2 16.2 16.0 +# WER on rt03(fg) 13.7 13.8 13.8 13.6 +# Final train prob -0.085 -0.106 -0.103 -0.104 +# Final valid prob -0.103 -0.118 -0.114 -0.116 +# Final train prob (xent) -1.230 -1.296 -1.274 -1.285 +# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283 +# Num-parameters 16292693 10924836 12170788 11580452 + + +# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer +# and the prefinal layers from 512 to 768. +# 7m2 is as 7m but with a bunch of tuning changes (model is smaller). +# 7m is as 7k but adding two non-splicing layers towards the beginning of the +# network. 
+# The impovement is pretty small but I've seen similar improvements on other +# setups with this architecture so I tend to believe it. + + +# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp +# System tdnn_7k_sp tdnn_7m_sp +# WER on train_dev(tg) 13.83 13.65 +# WER on train_dev(fg) 12.74 12.54 +# WER on eval2000(tg) 16.9 16.8 +# WER on eval2000(fg) 15.2 15.1 +# Final train prob -0.085 -0.084 +# Final valid prob -0.107 -0.103 +# Final train prob (xent) -1.267 -1.215 +# Final valid prob (xent) -1.3107 -1.2735 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp +# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7m19m +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1536 bottleneck-dim=192 + linear-component name=tdnn1l dim=512 $linear_opts + relu-batchnorm-layer name=tdnn2 $opts input=Append(-1,0,1) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn3 $opts dim=1536 bottleneck-dim=192 + linear-component name=tdnn3l dim=512 $linear_opts + relu-batchnorm-layer name=tdnn4 $opts input=Append(-1,0,1) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn5 $opts dim=1536 input=Append(tdnn4, tdnn2) bottleneck-dim=192 + linear-component name=tdnn5l dim=512 $linear_opts + relu-batchnorm-layer name=tdnn6 $opts input=Append(-3,0,3) dim=1536 bottleneck-dim=256 + linear-component name=tdnn6l dim=512 $linear_opts + relu-batchnorm-layer name=tdnn7 $opts input=Append(-3,0,3,tdnn5l,tdnn3l,tdnn1l) dim=1536 bottleneck-dim=256 + linear-component name=tdnn7l dim=512 $linear_opts + relu-batchnorm-layer name=tdnn8 $opts 
input=Append(-3,0,3) dim=1536 bottleneck-dim=256 + linear-component name=tdnn8l dim=512 $linear_opts + relu-batchnorm-layer name=tdnn9 $opts input=Append(-3,0,3,tdnn7l,tdnn5l,tdnn3l) dim=1536 bottleneck-dim=256 + linear-component name=tdnn9l dim=512 $linear_opts + relu-batchnorm-layer name=tdnn10 $opts input=Append(-3,0,3) dim=1536 bottleneck-dim=256 + linear-component name=tdnn10l dim=512 $linear_opts + relu-batchnorm-layer name=tdnn11 $opts input=Append(-3,0,3,tdnn9l,tdnn7l,tdnn5l) dim=1536 bottleneck-dim=256 + + relu-batchnorm-layer name=prefinal-chain input=tdnn11 $opts dim=1536 bottleneck-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=tdnn11 $opts dim=1536 bottleneck-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23b.sh new file mode 100755 index 00000000000..7b0f45e6899 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23b.sh @@ -0,0 +1,482 @@ +#!/bin/bash + +# 7m23b is as 7m23 but making the splicing more 'symmetric'... doing the +# splicing in 2 stages... +# 7m23 is as 7m19m but removing the bottlenecks from the batchnorm components and +# reducing the dim of the linear components... it's basically an attempt to +# reverse the factorization to have the splicing at a different point. +# + +# 7m19m is as 7m19l but with more skip connections +# Hm-- seems better than 19h. +# +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# System tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# WER on train_dev(tg) 12.61 12.72 12.55 +# WER on train_dev(fg) 11.72 11.62 11.52 +# WER on eval2000(tg) 15.4 15.4 15.2 +# WER on eval2000(fg) 13.7 13.8 13.6 +# WER on rt03(tg) 18.9 18.9 18.6 +# WER on rt03(fg) 16.3 16.4 16.2 +# Final train prob -0.091 -0.091 -0.089 +# Final valid prob -0.102 -0.103 -0.101 +# Final train prob (xent) -1.098 -1.095 -1.080 +# Final valid prob (xent) -1.1031 -1.1191 -1.0990 +# Num-parameters 21055012 20268580 21055012 +# +# 7m19l is as 7m19h but projecting down to an intermediate dim (512) before +# doing the Append... 
doing this by inserting a linear-component between +# pairs of relu-batchnorm-layers. +# A little worse. +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp +# System tdnn7m19h_sp tdnn7m19l_sp +# WER on train_dev(tg) 12.65 12.72 +# WER on train_dev(fg) 11.57 11.62 +# WER on eval2000(tg) 15.3 15.4 +# WER on eval2000(fg) 13.7 13.8 +# WER on rt03(tg) 18.8 18.9 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.091 -0.091 +# Final valid prob -0.102 -0.103 +# Final train prob (xent) -1.091 -1.095 +# Final valid prob (xent) -1.1064 -1.1191 +# Num-parameters 21055012 20268580 + + +# 7m19h is as 7m19e but with an extra bypass connection. A bit better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19e_sp tdnn7m19h_sp +# System tdnn7m19e_sp tdnn7m19h_sp +# WER on train_dev(tg) 12.75 12.65 +# WER on train_dev(fg) 11.77 11.57 +# WER on eval2000(tg) 15.5 15.3 +# WER on eval2000(fg) 14.0 13.7 +# WER on rt03(tg) 18.9 18.8 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.092 -0.091 +# Final valid prob -0.102 -0.102 +# Final train prob (xent) -1.094 -1.091 +# Final valid prob (xent) -1.1095 -1.1064 +# Num-parameters 20760100 21055012 + +# 7m19e is as 7m19c,d but with dims increased to 1536. Better! 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# System tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# WER on train_dev(tg) 13.77 12.86 13.01 12.75 +# WER on train_dev(fg) 12.65 11.82 12.02 11.77 +# WER on eval2000(tg) 16.1 15.4 15.7 15.5 +# WER on eval2000(fg) 14.3 13.8 14.0 14.0 +# WER on rt03(tg) 19.9 19.1 19.2 18.9 +# WER on rt03(fg) 17.4 16.6 16.7 16.4 +# Final train prob -0.111 -0.094 -0.096 -0.092 +# Final valid prob -0.120 -0.103 -0.105 -0.102 +# Final train prob (xent) -1.314 -1.117 -1.144 -1.094 +# Final valid prob (xent) -1.3247 -1.1223 -1.1478 -1.1095 +# Num-parameters 13361700 17824036 14887972 20760100 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# System tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# WER on train_dev(tg) 13.37 13.09 12.93 12.86 13.01 +# WER on train_dev(fg) 12.47 12.12 11.87 11.82 12.02 +# WER on eval2000(tg) 15.8 15.8 15.6 15.4 15.7 +# WER on eval2000(fg) 14.3 14.3 14.0 13.8 14.0 +# WER on rt03(tg) 15.1 14.8 14.9 14.8 14.9 +# WER on rt03(fg) 12.7 12.4 12.5 12.5 12.6 +# Final train prob -0.099 -0.096 -0.096 -0.094 -0.096 +# Final valid prob -0.110 -0.106 -0.106 -0.103 -0.105 +# Final train prob (xent) -1.302 -1.198 -1.188 -1.117 -1.144 +# Final valid prob (xent) -1.3184 -1.2070 -1.1980 -1.1223 -1.1478 +# Num-parameters 14216996 15528996 16512036 17824036 14887972 + +# 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). +# Seems about 0.1% better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# WER on train_dev(tg) 13.09 12.93 12.86 +# WER on train_dev(fg) 12.12 11.87 11.82 +# WER on eval2000(tg) 15.8 15.6 15.4 +# WER on eval2000(fg) 14.3 14.0 13.8 +# WER on rt03(tg) 14.8 14.9 14.8 +# WER on rt03(fg) 12.4 12.5 12.5 +# Final train prob -0.096 -0.096 -0.094 +# Final valid prob -0.106 -0.106 -0.103 +# Final train prob (xent) -1.198 -1.188 -1.117 +# Final valid prob (xent) -1.2070 -1.1980 -1.1223 +# Num-parameters 15528996 16512036 17824036 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 14.8 14.9 +# WER on rt03(fg) 12.4 12.5 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. 
+# Seems a little better but could be due to the increase in parameters. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp +# System tdnn7m8_sp tdnn7m9_sp +# WER on train_dev(tg) 13.60 13.88 +# WER on train_dev(fg) 12.62 12.64 +# WER on eval2000(tg) 16.8 16.1 +# WER on eval2000(fg) 15.4 14.4 +# WER on rt03(tg) 16.2 15.5 +# WER on rt03(fg) 13.7 13.1 +# Final train prob -0.105 -0.111 +# Final valid prob -0.115 -0.119 +# Final train prob (xent) -1.282 -1.309 +# Final valid prob (xent) -1.3194 -1.3246 +# Num-parameters 11580452 13818148 + +# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which +# is the same as 7m2->7m3, which was helpful there. +# Does seem helpful. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# WER on train_dev(tg) 13.70 13.74 13.81 13.60 +# WER on train_dev(fg) 12.67 12.76 12.74 12.62 +# WER on eval2000(tg) 16.6 17.1 17.0 16.8 +# WER on eval2000(fg) 15.1 15.4 15.4 15.4 +# WER on rt03(tg) 16.1 16.2 16.0 16.2 +# WER on rt03(fg) 13.7 13.8 13.6 13.7 +# Final train prob -0.085 -0.106 -0.104 -0.105 +# Final valid prob -0.103 -0.118 -0.116 -0.115 +# Final train prob (xent) -1.230 -1.296 -1.285 -1.282 +# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194 +# Num-parameters 16292693 10924836 11580452 11580452 + + +# 7m5b is as 7m5 but rducing the prefinal layer dims to previous values. +# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average, +# but not clear at all... for consistency with other setups I may retain +# this change. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# WER on train_dev(tg) 13.70 13.74 13.71 13.81 +# WER on train_dev(fg) 12.67 12.76 12.64 12.74 +# WER on eval2000(tg) 16.6 17.1 16.8 17.0 +# WER on eval2000(fg) 15.1 15.4 15.1 15.4 +# WER on rt03(tg) 16.1 16.2 16.2 16.0 +# WER on rt03(fg) 13.7 13.8 13.8 13.6 +# Final train prob -0.085 -0.106 -0.103 -0.104 +# Final valid prob -0.103 -0.118 -0.114 -0.116 +# Final train prob (xent) -1.230 -1.296 -1.274 -1.285 +# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283 +# Num-parameters 16292693 10924836 12170788 11580452 + + +# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer +# and the prefinal layers from 512 to 768. +# 7m2 is as 7m but with a bunch of tuning changes (model is smaller). +# 7m is as 7k but adding two non-splicing layers towards the beginning of the +# network. 
+# The impovement is pretty small but I've seen similar improvements on other +# setups with this architecture so I tend to believe it. + + +# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp +# System tdnn_7k_sp tdnn_7m_sp +# WER on train_dev(tg) 13.83 13.65 +# WER on train_dev(fg) 12.74 12.54 +# WER on eval2000(tg) 16.9 16.8 +# WER on eval2000(fg) 15.2 15.1 +# Final train prob -0.085 -0.084 +# Final valid prob -0.107 -0.103 +# Final train prob (xent) -1.267 -1.215 +# Final valid prob (xent) -1.3107 -1.2735 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp +# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7m23b +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1536 + linear-component name=tdnn1l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1536 + linear-component name=tdnn2l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1536 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1536 + linear-component name=tdnn4l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1536 input=Append(tdnn4l, tdnn2l) bottleneck-dim=192 + linear-component name=tdnn5l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn5l,tdnn3l,tdnn1l) dim=1536 + linear-component name=tdnn7l dim=256 
$linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn7l,tdnn5l,tdnn3l) dim=1536 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn9l,tdnn7l,tdnn5l) dim=1536 + + relu-batchnorm-layer name=prefinal-chain input=tdnn11 $opts dim=1536 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=tdnn11 $opts dim=1536 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23b2.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23b2.sh new file mode 100755 index 00000000000..9f943cf7d4d --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23b2.sh @@ -0,0 +1,501 @@ +#!/bin/bash + +# 7m23b2 is as 7m23b but fixing an issue at the last layers. +# 7m23b is as 7m23 but making the splicing more 'symmetric'... doing the +# splicing in 2 stages. Interestingly, objf is not better than 23, but +# WER is slightly better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19m_sp tdnn7m23_sp tdnn7m23b2_sp +# System tdnn7m19m_sp tdnn7m23_sp tdnn7m23b2_sp +# WER on train_dev(tg) 12.55 12.23 12.38 +# WER on train_dev(fg) 11.52 11.29 11.44 +# WER on eval2000(tg) 15.2 15.2 15.1 +# WER on eval2000(fg) 13.6 13.7 13.6 +# WER on rt03(tg) 18.6 18.7 18.4 +# WER on rt03(fg) 16.2 16.3 16.1 +# Final train prob -0.089 -0.083 -0.084 +# Final valid prob -0.101 -0.097 -0.098 +# Final train prob (xent) -1.080 -1.025 -1.049 +# Final valid prob (xent) -1.0990 -1.0548 -1.0661 +# Num-parameters 21055012 23120164 23120164 + + +# 7m23 is as 7m19m but removing the bottlenecks from the batchnorm components and +# reducing the dim of the linear components... it's basically an attempt to +# reverse the factorization to have the splicing at a different point. 
+# + +# 7m19m is as 7m19l but with more skip connections +# Hm-- seems better than 19h. +# +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# System tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# WER on train_dev(tg) 12.61 12.72 12.55 +# WER on train_dev(fg) 11.72 11.62 11.52 +# WER on eval2000(tg) 15.4 15.4 15.2 +# WER on eval2000(fg) 13.7 13.8 13.6 +# WER on rt03(tg) 18.9 18.9 18.6 +# WER on rt03(fg) 16.3 16.4 16.2 +# Final train prob -0.091 -0.091 -0.089 +# Final valid prob -0.102 -0.103 -0.101 +# Final train prob (xent) -1.098 -1.095 -1.080 +# Final valid prob (xent) -1.1031 -1.1191 -1.0990 +# Num-parameters 21055012 20268580 21055012 +# +# 7m19l is as 7m19h but projecting down to an intermediate dim (512) before +# doing the Append... doing this by inserting a linear-component between +# pairs of relu-batchnorm-layers. +# A little worse. +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp +# System tdnn7m19h_sp tdnn7m19l_sp +# WER on train_dev(tg) 12.65 12.72 +# WER on train_dev(fg) 11.57 11.62 +# WER on eval2000(tg) 15.3 15.4 +# WER on eval2000(fg) 13.7 13.8 +# WER on rt03(tg) 18.8 18.9 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.091 -0.091 +# Final valid prob -0.102 -0.103 +# Final train prob (xent) -1.091 -1.095 +# Final valid prob (xent) -1.1064 -1.1191 +# Num-parameters 21055012 20268580 + + +# 7m19h is as 7m19e but with an extra bypass connection. A bit better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19e_sp tdnn7m19h_sp +# System tdnn7m19e_sp tdnn7m19h_sp +# WER on train_dev(tg) 12.75 12.65 +# WER on train_dev(fg) 11.77 11.57 +# WER on eval2000(tg) 15.5 15.3 +# WER on eval2000(fg) 14.0 13.7 +# WER on rt03(tg) 18.9 18.8 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.092 -0.091 +# Final valid prob -0.102 -0.102 +# Final train prob (xent) -1.094 -1.091 +# Final valid prob (xent) -1.1095 -1.1064 +# Num-parameters 20760100 21055012 + +# 7m19e is as 7m19c,d but with dims increased to 1536. Better! + +# local/chain/compare_wer_general.sh --rt03 tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# System tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# WER on train_dev(tg) 13.77 12.86 13.01 12.75 +# WER on train_dev(fg) 12.65 11.82 12.02 11.77 +# WER on eval2000(tg) 16.1 15.4 15.7 15.5 +# WER on eval2000(fg) 14.3 13.8 14.0 14.0 +# WER on rt03(tg) 19.9 19.1 19.2 18.9 +# WER on rt03(fg) 17.4 16.6 16.7 16.4 +# Final train prob -0.111 -0.094 -0.096 -0.092 +# Final valid prob -0.120 -0.103 -0.105 -0.102 +# Final train prob (xent) -1.314 -1.117 -1.144 -1.094 +# Final valid prob (xent) -1.3247 -1.1223 -1.1478 -1.1095 +# Num-parameters 13361700 17824036 14887972 20760100 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# System tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# WER on train_dev(tg) 13.37 13.09 12.93 12.86 13.01 +# WER on train_dev(fg) 12.47 12.12 11.87 11.82 12.02 +# WER on eval2000(tg) 15.8 15.8 15.6 15.4 15.7 +# WER on eval2000(fg) 14.3 14.3 14.0 13.8 14.0 +# WER on rt03(tg) 15.1 14.8 14.9 14.8 14.9 +# WER on rt03(fg) 12.7 12.4 12.5 12.5 12.6 +# Final train prob -0.099 -0.096 -0.096 -0.094 -0.096 +# Final valid prob -0.110 -0.106 -0.106 -0.103 -0.105 +# Final train prob (xent) -1.302 -1.198 -1.188 -1.117 -1.144 +# Final valid prob (xent) -1.3184 -1.2070 -1.1980 -1.1223 -1.1478 +# Num-parameters 14216996 15528996 
16512036 17824036 14887972 + +# 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). +# Seems about 0.1% better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# WER on train_dev(tg) 13.09 12.93 12.86 +# WER on train_dev(fg) 12.12 11.87 11.82 +# WER on eval2000(tg) 15.8 15.6 15.4 +# WER on eval2000(fg) 14.3 14.0 13.8 +# WER on rt03(tg) 14.8 14.9 14.8 +# WER on rt03(fg) 12.4 12.5 12.5 +# Final train prob -0.096 -0.096 -0.094 +# Final valid prob -0.106 -0.106 -0.103 +# Final train prob (xent) -1.198 -1.188 -1.117 +# Final valid prob (xent) -1.2070 -1.1980 -1.1223 +# Num-parameters 15528996 16512036 17824036 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 14.8 14.9 +# WER on rt03(fg) 12.4 12.5 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. +# Seems a little better but could be due to the increase in parameters. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp +# System tdnn7m8_sp tdnn7m9_sp +# WER on train_dev(tg) 13.60 13.88 +# WER on train_dev(fg) 12.62 12.64 +# WER on eval2000(tg) 16.8 16.1 +# WER on eval2000(fg) 15.4 14.4 +# WER on rt03(tg) 16.2 15.5 +# WER on rt03(fg) 13.7 13.1 +# Final train prob -0.105 -0.111 +# Final valid prob -0.115 -0.119 +# Final train prob (xent) -1.282 -1.309 +# Final valid prob (xent) -1.3194 -1.3246 +# Num-parameters 11580452 13818148 + +# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which +# is the same as 7m2->7m3, which was helpful there. +# Does seem helpful. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# WER on train_dev(tg) 13.70 13.74 13.81 13.60 +# WER on train_dev(fg) 12.67 12.76 12.74 12.62 +# WER on eval2000(tg) 16.6 17.1 17.0 16.8 +# WER on eval2000(fg) 15.1 15.4 15.4 15.4 +# WER on rt03(tg) 16.1 16.2 16.0 16.2 +# WER on rt03(fg) 13.7 13.8 13.6 13.7 +# Final train prob -0.085 -0.106 -0.104 -0.105 +# Final valid prob -0.103 -0.118 -0.116 -0.115 +# Final train prob (xent) -1.230 -1.296 -1.285 -1.282 +# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194 +# Num-parameters 16292693 10924836 11580452 11580452 + + +# 7m5b is as 7m5 but rducing the prefinal layer dims to previous values. +# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average, +# but not clear at all... for consistency with other setups I may retain +# this change. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# WER on train_dev(tg) 13.70 13.74 13.71 13.81 +# WER on train_dev(fg) 12.67 12.76 12.64 12.74 +# WER on eval2000(tg) 16.6 17.1 16.8 17.0 +# WER on eval2000(fg) 15.1 15.4 15.1 15.4 +# WER on rt03(tg) 16.1 16.2 16.2 16.0 +# WER on rt03(fg) 13.7 13.8 13.8 13.6 +# Final train prob -0.085 -0.106 -0.103 -0.104 +# Final valid prob -0.103 -0.118 -0.114 -0.116 +# Final train prob (xent) -1.230 -1.296 -1.274 -1.285 +# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283 +# Num-parameters 16292693 10924836 12170788 11580452 + + +# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer +# and the prefinal layers from 512 to 768. +# 7m2 is as 7m but with a bunch of tuning changes (model is smaller). +# 7m is as 7k but adding two non-splicing layers towards the beginning of the +# network. +# The impovement is pretty small but I've seen similar improvements on other +# setups with this architecture so I tend to believe it. 
+ + +# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp +# System tdnn_7k_sp tdnn_7m_sp +# WER on train_dev(tg) 13.83 13.65 +# WER on train_dev(fg) 12.74 12.54 +# WER on eval2000(tg) 16.9 16.8 +# WER on eval2000(fg) 15.2 15.1 +# Final train prob -0.085 -0.084 +# Final valid prob -0.107 -0.103 +# Final train prob (xent) -1.267 -1.215 +# Final valid prob (xent) -1.3107 -1.2735 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp +# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7m23b2 +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1536 + linear-component name=tdnn1l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1536 + linear-component name=tdnn2l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1536 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1536 + linear-component name=tdnn4l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1536 input=Append(tdnn4l, tdnn2l) bottleneck-dim=192 + linear-component name=tdnn5l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn5l,tdnn3l,tdnn1l) dim=1536 + linear-component name=tdnn7l dim=256 
$linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn7l,tdnn5l,tdnn3l) dim=1536 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn9l,tdnn7l,tdnn5l) dim=1536 + linear-component name=tdnn11l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=tdnn11l $opts dim=1536 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=tdnn11l $opts dim=1536 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index c6b0619bca8..6fbde1fbbcc 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -69,7 +69,8 @@ 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, 'renorm-component': xlayers.XconfigRenormComponent, - 'no-op-component': xlayers.XconfigNoOpComponent + 'no-op-component': xlayers.XconfigNoOpComponent, + 'linear-component': xlayers.XconfigLinearComponent } # Turn a config line and a list of previous layers into diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py index ef05887e469..63f6278d1ca 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py @@ -121,3 +121,79 @@ def _generate_config(self): self.name, input_desc)) configs.append(line) return configs + + +class XconfigLinearComponent(XconfigLayerBase): + """This class is for parsing lines like + 'linear-component name=linear1 dim=1024 input=Append(-3,0,3)' + which will produce just a single component, of type LinearComponent, with + output-dim 1024 in this case, and input-dim determined by the dimension + of the input.
+ + Parameters of the class, and their defaults: + input='[-1]' [Descriptor giving the input of the layer.] + dim=-1 [Dimension of the output] + + The following (shown with their effective defaults) are just passed through + to the component's config line. + + orthonormal-constraint=-1 + max-change=0.75 + l2-regularize=0.0 + + """ + def __init__(self, first_token, key_to_value, prev_names=None): + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input': '[-1]', + 'dim': -1, + 'orthonormal-constraint': '', + 'max-change': 0.75, + 'l2-regularize': '' } + + def check_configs(self): + if self.config['dim'] <= 0: + raise RuntimeError("'dim' must be specified and > 0.") + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + return self.name + + def output_dim(self, auxiliary_output=None): + assert auxiliary_output is None + assert self.config['dim'] > 0 + return self.config['dim'] + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. 
+ input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.config['dim'] + + opts = '' + for opt_name in ['orthonormal-constraint', 'max-change', 'l2-regularize']: + value = self.config[opt_name] + if value != '': + opts += ' {0}={1}'.format(opt_name, value) + + configs = [] + line = ('component name={0} type=LinearComponent input-dim={1} output-dim={2} ' + '{3}'.format(self.name, input_dim, output_dim, opts)) + configs.append(line) + line = ('component-node name={0} component={0} input={1}'.format( + self.name, input_desc)) + configs.append(line) + return configs From 96cbdd6974558fbee0e54bbe5474a61b7b52c237 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Tue, 23 Jan 2018 11:26:28 +0800 Subject: [PATCH 075/184] [egs] Update chain results and add new scripts for fisher_swbd (related to #2136) (#2173) --- .../s5/local/chain/run_blstm_6j.sh | 60 ++-- .../s5/local/chain/run_tdnn_lstm_1a.sh | 6 + .../s5/local/chain/run_tdnn_lstm_1b.sh | 310 ++++++++++++++++++ .../s5/local/chain/run_tdnn_opgru_1a.sh | 62 ++-- .../s5/local/chain/run_tdnn_opgru_1b.sh | 308 +++++++++++++++++ .../s5/local/chain/show_chain_wer.sh | 19 -- .../s5/local/chain/show_chain_wer_rt03.sh | 20 -- 7 files changed, 700 insertions(+), 85 deletions(-) create mode 100644 egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh create mode 100644 egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh delete mode 100644 egs/fisher_swbd/s5/local/chain/show_chain_wer.sh delete mode 100644 egs/fisher_swbd/s5/local/chain/show_chain_wer_rt03.sh diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh index 9810a03ee58..03d362ef552 100644 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh @@ -5,31 +5,37 @@ # The model training procedure is similar to run_blstm_6j.sh under egs/swbd/s5c # ./local/chain/compare_wer_general.sh 
blstm_6j_sp -# System blstm_6j_sp -# WER on eval2000(tg) 12.1 -# WER on eval2000(fg) 11.9 -# WER on rt03(tg) 11.9 -# WER on rt03(fg) 11.6 -# Final train prob -0.059 -# Final valid prob -0.072 -# Final train prob (xent) -0.711 -# Final valid prob (xent) -0.7782 +# System blstm_6j_sp +# WER on eval2000(tg) 12.3 +# WER on eval2000(fg) 12.2 +# WER on rt03(tg) 11.7 +# WER on rt03(fg) 11.5 +# Final train prob -0.061 +# Final valid prob -0.082 +# Final train prob (xent) -0.698 +# Final valid prob (xent) -0.8108 +# num-params=41.3M + +# ./steps/info/chain_dir_info.pl exp/chain/blstm_6j_sp +# exp/chain/blstm_6j_sp: num-iters=2384 nj=3..16 num-params=41.3M dim=40+100->6149 combine=-0.075->-0.074 (over 15) +# xent:train/valid[1587,2383,final]=(-0.754,-0.710,-0.698/-0.828,-0.824,-0.811) +# logprob:train/valid[1587,2383,final]=(-0.070,-0.063,-0.061/-0.082,-0.084,-0.082) # ./local/chain/show_chain_wer.sh blstm_6j_sp -# %WER 15.2 | 2628 21594 | 87.0 8.2 4.8 2.2 15.2 52.0 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys -# %WER 12.1 | 4459 42989 | 89.8 6.8 3.4 1.9 12.1 49.4 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.filt.sys -# %WER 8.5 | 1831 21395 | 92.7 5.1 2.1 1.3 8.5 44.0 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_tg/score_8_1.0/eval2000_hires.ctm.swbd.filt.sys -# %WER 15.0 | 2628 21594 | 87.2 8.1 4.7 2.2 15.0 51.4 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys -# %WER 11.9 | 4459 42989 | 90.0 6.7 3.3 1.8 11.9 48.6 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.filt.sys -# %WER 8.5 | 1831 21395 | 92.7 5.0 2.3 1.2 8.5 43.7 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 16.0 | 2628 21594 | 86.3 8.7 5.0 2.3 16.0 53.8 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_tg/score_6_0.0/eval2000_hires.ctm.callhm.filt.sys +# %WER 12.3 | 4459 42989 | 89.3 
6.6 4.1 1.6 12.3 49.4 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys +# %WER 8.3 | 1831 21395 | 92.8 4.8 2.4 1.1 8.3 41.8 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_tg/score_10_1.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 15.7 | 2628 21594 | 86.5 8.5 5.0 2.3 15.7 53.2 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_fg/score_6_0.0/eval2000_hires.ctm.callhm.filt.sys +# %WER 12.2 | 4459 42989 | 89.7 6.9 3.4 2.0 12.2 50.1 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_fg/score_6_0.0/eval2000_hires.ctm.filt.sys +# %WER 8.2 | 1831 21395 | 93.0 4.8 2.2 1.2 8.2 41.6 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys # ./local/chain/show_chain_rt03_wer.sh blstm_6j_sp -# %WER 10.1 | 3970 36721 | 91.1 5.3 3.6 1.2 10.1 43.8 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys -# %WER 11.9 | 8420 76157 | 89.6 6.6 3.8 1.5 11.9 45.2 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.filt.sys -# %WER 13.5 | 4450 39436 | 88.2 7.9 3.9 1.8 13.5 46.4 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.swbd.filt.sys -# %WER 9.7 | 3970 36721 | 91.5 5.1 3.5 1.2 9.7 43.4 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys -# %WER 11.6 | 8420 76157 | 89.9 6.5 3.6 1.5 11.6 44.7 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.filt.sys -# %WER 13.3 | 4450 39436 | 88.5 7.7 3.8 1.8 13.3 45.8 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.swbd.filt.sys +# %WER 9.9 | 3970 36721 | 91.3 5.3 3.4 1.2 9.9 43.6 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 11.7 | 8420 76157 | 89.6 6.3 4.1 1.3 11.7 44.7 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.filt.sys +# %WER 13.3 | 4450 39436 | 88.2 7.5 4.3 1.5 13.3 45.3 | 
exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys +# %WER 9.7 | 3970 36721 | 91.4 5.2 3.4 1.1 9.7 43.1 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 11.5 | 8420 76157 | 89.8 6.2 4.0 1.3 11.5 44.3 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys +# %WER 13.2 | 4450 39436 | 88.3 7.3 4.3 1.5 13.2 45.1 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys set -e @@ -140,14 +146,14 @@ if [ $stage -le 12 ]; then fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults - lstmp-layer name=blstm1-forward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 - lstmp-layer name=blstm1-backward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + fast-lstmp-layer name=blstm1-forward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + fast-lstmp-layer name=blstm1-backward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 - lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 - lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + fast-lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + fast-lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 - lstmp-layer name=blstm3-forward 
input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 - lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + fast-lstmp-layer name=blstm3-forward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + fast-lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 ## adding the layers for chain branch output-layer name=output input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh index d057470552f..bccd61533d2 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh @@ -7,6 +7,7 @@ # I just apply renorm component in TDNN layers. 
# ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1a_sp # System tdnn_lstm_1a_sp +# num-params 39.7M # WER on eval2000(tg) 12.3 # [looped:] 12.2 # WER on eval2000(fg) 12.1 @@ -20,6 +21,11 @@ # Final train prob (xent) -0.882 # Final valid prob (xent) -0.9393 +# ./steps/info/chain_dir_info.pl exp/chain/tdnn_lstm_1a_sp +#exp/chain/tdnn_lstm_1a_sp: num-iters=2384 nj=3..16 num-params=39.7M dim=40+100->6149 combine=-0.097->-0.086 +#xent:train/valid[1587,2383,final]=(-0.949,-0.898,-0.882/-0.998,-0.949,-0.939) +#logprob:train/valid[1587,2383,final]=(-0.079,-0.075,-0.074/-0.087,-0.082,-0.084) + # ./show_chain_wer.sh tdnn_lstm_1a_sp # %WER 16.0 | 2628 21594 | 86.3 9.0 4.7 2.3 16.0 54.4 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys # %WER 12.3 | 4459 42989 | 89.4 7.1 3.5 1.7 12.3 49.8 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh new file mode 100644 index 00000000000..2272f746ab3 --- /dev/null +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh @@ -0,0 +1,310 @@ +#!/bin/bash +# Copyright 2017 University of Chinese Academy of Sciences (UCAS) Gaofeng Cheng +# Apache 2.0 + +# Similar to swbd\s5c\local\chain\tuning\run_tdnn_lstm_1e.sh +# Difference between tdnn_lstm_1a and tdnn_lstm_1b: +# chunk width 150 140,100,160 +# xent_regularize 0.025 0.01 +# minibatch 64 64,32 +# frames-per-iter 1200000 1500000 +# batchnorm in TDNN No Yes +# Dropout in LSTM No Yes + +# ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1a_sp tdnn_lstm_1b_sp +# System tdnn_lstm_1a_sp tdnn_lstm_1b_sp +# num-params 39.7M 39.7M +# WER on eval2000(tg) 12.3 12.3 +# [looped:] 12.2 12.3 +# WER on eval2000(fg) 12.1 12.0 +# [looped:] 12.1 12.2 +# WER on rt03(tg) 11.6 11.4 +# [looped:] 11.6 11.6 +# WER on rt03(fg) 11.3 11.1 +# [looped:] 11.3 11.3 +# Final train prob -0.074 -0.087 
+# Final valid prob -0.084 -0.088 +# Final train prob (xent) -0.882 -1.015 +# Final valid prob (xent) -0.9393 -0.9837 + +#./steps/info/chain_dir_info.pl exp/chain/tdnn_lstm_1b_sp +#exp/chain/tdnn_lstm_1b_sp: num-iters=1909 nj=3..16 num-params=39.7M dim=40+100->6149 combine=-0.087->-0.086 (over 5) +#xent:train/valid[1270,1908,final]=(-1.37,-1.02,-1.01/-1.31,-1.00,-0.984) +#logprob:train/valid[1270,1908,final]=(-0.108,-0.088,-0.087/-0.103,-0.091,-0.088) + + +# online results +# Eval2000 +#%WER 15.9 | 2628 21594 | 86.0 8.6 5.4 1.9 15.9 53.5 | exp/chain/tdnn_lstm_1b_online/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +#%WER 12.3 | 4459 42989 | 89.1 6.8 4.1 1.5 12.3 49.2 | exp/chain/tdnn_lstm_1b_online/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys +#%WER 8.6 | 1831 21395 | 92.5 5.2 2.3 1.1 8.6 42.6 | exp/chain/tdnn_lstm_1b_online/decode_eval2000_fsh_sw1_tg/score_8_1.0/eval2000_hires.ctm.swbd.filt.sys +#%WER 15.7 | 2628 21594 | 86.2 8.5 5.3 1.9 15.7 53.0 | exp/chain/tdnn_lstm_1b_online/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +#%WER 12.1 | 4459 42989 | 89.3 6.6 4.0 1.5 12.1 48.4 | exp/chain/tdnn_lstm_1b_online/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys +#%WER 8.5 | 1831 21395 | 92.5 4.9 2.5 1.0 8.5 41.1 | exp/chain/tdnn_lstm_1b_online/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys + +# online results +# RT03 +#%WER 9.4 | 3970 36721 | 91.4 5.0 3.5 0.9 9.4 39.5 | exp/chain/tdnn_lstm_1b_online/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys +#%WER 11.6 | 8420 76157 | 89.5 6.4 4.1 1.1 11.6 42.0 | exp/chain/tdnn_lstm_1b_online/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.filt.sys +#%WER 13.5 | 4450 39436 | 87.6 7.3 5.0 1.1 13.5 44.5 | exp/chain/tdnn_lstm_1b_online/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys +#%WER 9.2 | 3970 36721 | 91.6 4.9 3.5 0.9 9.2 39.3 | 
exp/chain/tdnn_lstm_1b_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys +#%WER 11.3 | 8420 76157 | 89.8 6.2 4.0 1.1 11.3 41.6 | exp/chain/tdnn_lstm_1b_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys +#%WER 13.2 | 4450 39436 | 88.0 7.4 4.6 1.2 13.2 43.6 | exp/chain/tdnn_lstm_1b_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +frames_per_chunk=140,100,160 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 +dropout_schedule='0,0@0.20,0.2@0.50,0' +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 11000 data/$build_tree_train_set $lang $build_tree_ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + lstm_opts="decay-time=20 dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## 
adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg +fi + +decode_suff=fsh_sw1_tg +graph_dir=$dir/graph_fsh_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in rt03 eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) & + done +fi + +test_online_decoding=false +lang=data/lang_fsh_sw1_tg +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in rt03 eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh index 2de8d774451..737e0571b07 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh @@ -8,21 +8,45 @@ # and renorm in its recurrence. Experiments show that the TDNN-NormOPGRU could achieve similar # results than TDNN-LSTMP and BLSTMP in both large or small data sets (80 ~ 2300 Hrs). 
-# ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1a_sp tdnn_opgru_1a_sp -# System tdnn_lstm_1a_sp tdnn_opgru_1a_sp -# WER on eval2000(tg) 12.3 11.6 -# [looped:] 12.2 11.7 -# WER on eval2000(fg) 12.1 11.6 -# [looped:] 12.1 11.6 -# WER on rt03(tg) 11.6 10.9 -# [looped:] 11.6 10.9 -# WER on rt03(fg) 11.3 10.7 -# [looped:] 11.3 10.7 -# Final train prob -0.074 -0.087 -# Final valid prob -0.084 -0.092 -# Final train prob (xent) -0.882 -1.007 -# Final valid prob (xent) -0.9393 -1.0350 - +# ./local/chain/compare_wer_general.sh tdnn_lstm_1a_sp tdnn_lstm_1b_sp tdnn_opgru_1a_sp +# num parameter 39.7M 39.7M 34.9M +# System tdnn_lstm_1a_sp tdnn_lstm_1b_sp tdnn_opgru_1a_sp +# WER on eval2000(tg) 12.3 12.3 11.7 +# [looped:] 12.2 12.3 11.6 +# WER on eval2000(fg) 12.1 12.0 11.7 +# [looped:] 12.1 12.2 11.6 +# WER on rt03(tg) 11.6 11.4 11.0 +# [looped:] 11.6 11.6 11.0 +# WER on rt03(fg) 11.3 11.1 10.7 +# [looped:] 11.3 11.3 10.8 +# Final train prob -0.074 -0.087 -0.085 +# Final valid prob -0.084 -0.088 -0.093 +# Final train prob (xent) -0.882 -1.015 -0.972 +# Final valid prob (xent) -0.9393 -0.9837 -1.0275 + +#./steps/info/chain_dir_info.pl exp/chain/tdnn_opgru_1a_sp +#exp/chain/tdnn_opgru_1a_sp: num-iters=2384 nj=3..16 num-params=34.9M dim=40+100->6149 combine=-0.096->-0.095 (over 8) +#xent:train/valid[1587,2383,final]=(-1.46,-0.960,-0.972/-1.49,-1.02,-1.03) +#logprob:train/valid[1587,2383,final]=(-0.114,-0.086,-0.085/-0.114,-0.094,-0.093) + +# online results +# Eval2000 +# %WER 14.7 | 2628 21594 | 87.3 8.5 4.2 2.0 14.7 50.8 | exp/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +# %WER 11.7 | 4459 42989 | 89.9 7.0 3.1 1.7 11.7 48.1 | exp/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.filt.sys +# %WER 8.3 | 1831 21395 | 92.7 4.9 2.4 1.0 8.3 42.2 | exp/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 14.7 | 2628 21594 
| 87.4 8.5 4.1 2.1 14.7 50.5 | exp/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +# %WER 11.6 | 4459 42989 | 90.1 6.9 3.0 1.7 11.6 47.6 | exp/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.filt.sys +# %WER 8.1 | 1831 21395 | 92.9 4.8 2.3 1.1 8.1 41.8 | exp/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys + +# online results +# RT03 +# %WER 8.9 | 3970 36721 | 92.1 5.3 2.5 1.1 8.9 37.3 | exp/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 11.0 | 8420 76157 | 90.1 6.1 3.8 1.1 11.0 41.0 | exp/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.filt.sys +# %WER 13.0 | 4450 39436 | 88.3 7.7 4.0 1.3 13.0 43.1 | exp/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys +# %WER 8.6 | 3970 36721 | 92.4 4.9 2.8 1.0 8.6 37.2 | exp/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 10.8 | 8420 76157 | 90.4 6.2 3.4 1.2 10.8 40.0 | exp/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys +# %WER 12.8 | 4450 39436 | 88.6 7.5 4.0 1.4 12.8 42.5 | exp/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys + set -e @@ -125,7 +149,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - lstm_opts="dropout-per-frame=true dropout-proportion=0.0 " + gru_opts="dropout-per-frame=true dropout-proportion=0.0 " mkdir -p $dir/configs cat < $dir/configs/network.xconfig @@ -143,13 +167,13 @@ if [ $stage -le 12 ]; then relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults - norm-opgru-layer name=opgru1 
cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + norm-opgru-layer name=opgru1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 - norm-opgru-layer name=opgru2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + norm-opgru-layer name=opgru2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 - norm-opgru-layer name=opgru3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + norm-opgru-layer name=opgru3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts ## adding the layers for chain branch output-layer name=output input=opgru3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh new file mode 100644 index 00000000000..762db86a8cf --- /dev/null +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh @@ -0,0 +1,308 @@ +#!/bin/bash +# Copyright 2017 University of Chinese Academy of Sciences (UCAS) Gaofeng Cheng +# Apache 2.0 + +# Similar to tdnn_lstm_1e (from egs/swbd/s5c). 
+# Difference between tdnn_opgru_1a and tdnn_opgru_1b: +# chunk width 150 140,100,160 +# xent_regularize 0.025 0.01 +# minibatch 64 64,32 +# frames-per-iter 1200000 1500000 + +# ./local/chain/compare_wer_general.sh tdnn_lstm_1a_sp tdnn_lstm_1b_sp tdnn_opgru_1a_sp tdnn_opgru_1b_sp +# num parameter 39.7M 39.7M 34.9M 34.9M +# System tdnn_lstm_1a_sp tdnn_lstm_1b_sp tdnn_opgru_1a_sp tdnn_opgru_1b_sp +# WER on eval2000(tg) 12.3 12.3 11.7 12.2 +# [looped:] 12.2 12.3 11.6 12.1 +# WER on eval2000(fg) 12.1 12.0 11.7 12.0 +# [looped:] 12.1 12.2 11.6 11.9 +# WER on rt03(tg) 11.6 11.4 11.0 11.3 +# [looped:] 11.6 11.6 11.0 11.3 +# WER on rt03(fg) 11.3 11.1 10.7 11.1 +# [looped:] 11.3 11.3 10.8 11.0 +# Final train prob -0.074 -0.087 -0.085 -0.097 +# Final valid prob -0.084 -0.088 -0.093 -0.093 +# Final train prob (xent) -0.882 -1.015 -0.972 -1.121 +# Final valid prob (xent) -0.9393 -0.9837 -1.0275 -1.0703 + + +#./steps/info/chain_dir_info.pl exp/chain/tdnn_opgru_1b_sp +# exp/chain/tdnn_opgru_1b_sp: num-iters=1807 nj=3..16 num-params=34.9M dim=40+100->6149 combine=-0.102->-0.101 (over 5) +# xent:train/valid[1202,1806,final]=(-1.70,-1.11,-1.12/-1.63,-1.06,-1.07) +# logprob:train/valid[1202,1806,final]=(-0.131,-0.098,-0.097/-0.123,-0.094,-0.093) + +# online results +# Eval2000 +#%WER 15.7 | 2628 21594 | 86.2 8.5 5.3 2.0 15.7 53.2 | exp/chain/tdnn_opgru_1b_sp_online/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +#%WER 12.2 | 4459 42989 | 89.3 6.7 4.0 1.5 12.2 48.9 | exp/chain/tdnn_opgru_1b_sp_online/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys +#%WER 8.5 | 1831 21395 | 92.6 5.0 2.4 1.0 8.5 41.7 | exp/chain/tdnn_opgru_1b_sp_online/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +#%WER 15.6 | 2628 21594 | 86.4 8.3 5.3 2.0 15.6 52.5 | exp/chain/tdnn_opgru_1b_sp_online/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +#%WER 12.1 | 4459 42989 | 89.5 6.8 3.6 1.6 12.1 47.9 | 
exp/chain/tdnn_opgru_1b_sp_online/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.filt.sys +#%WER 8.4 | 1831 21395 | 92.7 4.9 2.4 1.1 8.4 41.3 | exp/chain/tdnn_opgru_1b_sp_online/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys + +# online results +# RT03 +#%WER 9.1 | 3970 36721 | 91.8 5.3 2.9 0.9 9.1 37.7 | exp/chain/tdnn_opgru_1b_sp_online/decode_rt03_fsh_sw1_tg/score_7_1.0/rt03_hires.ctm.fsh.filt.sys +#%WER 11.4 | 8420 76157 | 89.7 6.8 3.5 1.2 11.4 40.6 | exp/chain/tdnn_opgru_1b_sp_online/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.filt.sys +#%WER 13.4 | 4450 39436 | 87.8 7.8 4.4 1.2 13.4 43.6 | exp/chain/tdnn_opgru_1b_sp_online/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys +#%WER 8.9 | 3970 36721 | 92.0 5.0 3.0 0.9 8.9 37.7 | exp/chain/tdnn_opgru_1b_sp_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys +#%WER 11.1 | 8420 76157 | 90.0 6.3 3.7 1.1 11.1 40.4 | exp/chain/tdnn_opgru_1b_sp_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys +#%WER 13.2 | 4450 39436 | 88.1 7.5 4.4 1.3 13.2 42.9 | exp/chain/tdnn_opgru_1b_sp_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_opgru_1b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= +dropout_schedule='0,0@0.20,0.2@0.50,0' + +# training options +leftmost_questions_truncate=-1 +frames_per_chunk=140,100,160 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 11000 data/$build_tree_train_set $lang $build_tree_ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + gru_opts="dropout-per-frame=true dropout-proportion=0.0 " + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2, ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + norm-opgru-layer name=opgru1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + norm-opgru-layer name=opgru2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + norm-opgru-layer 
name=opgru3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + + ## adding the layers for chain branch + output-layer name=output input=opgru3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=opgru3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg +fi + +decode_suff=fsh_sw1_tg +graph_dir=$dir/graph_fsh_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in rt03 eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) & + done +fi + +test_online_decoding=true +lang=data/lang_fsh_sw1_tg +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in rt03 eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/fisher_swbd/s5/local/chain/show_chain_wer.sh b/egs/fisher_swbd/s5/local/chain/show_chain_wer.sh deleted file mode 100644 index ce693c8ad56..00000000000 --- a/egs/fisher_swbd/s5/local/chain/show_chain_wer.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -for l in $*; do - grep Sum exp/chain/${1}/decode_eval2000_fsh_sw1_tg/score*/eval2000_hires.ctm.callhm.filt.sys | grep -v swbd | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_eval2000_fsh_sw1_tg/score*/eval2000_hires.ctm.filt.sys | grep -v swbd | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_eval2000_fsh_sw1_tg/score*/eval2000_hires.ctm.swbd.filt.sys | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_eval2000_fsh_sw1_fg/score*/eval2000_hires.ctm.callhm.filt.sys | grep -v swbd | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_eval2000_fsh_sw1_fg/score*/eval2000_hires.ctm.filt.sys | grep -v swbd | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_eval2000_fsh_sw1_fg/score*/eval2000_hires.ctm.swbd.filt.sys | utils/best_wer.sh -done diff --git a/egs/fisher_swbd/s5/local/chain/show_chain_wer_rt03.sh b/egs/fisher_swbd/s5/local/chain/show_chain_wer_rt03.sh deleted file mode 100644 index 6aca067a84c..00000000000 --- 
a/egs/fisher_swbd/s5/local/chain/show_chain_wer_rt03.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -for l in $*; do - grep Sum exp/chain/${1}/decode_rt03_fsh_sw1_tg/score*/rt03_hires.ctm.fsh.filt.sys | grep -v swbd | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_rt03_fsh_sw1_tg/score*/rt03_hires.ctm.filt.sys | grep -v swbd | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_rt03_fsh_sw1_tg/score*/rt03_hires.ctm.swbd.filt.sys | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_rt03_fsh_sw1_fg/score*/rt03_hires.ctm.fsh.filt.sys | grep -v swbd | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_rt03_fsh_sw1_fg/score*/rt03_hires.ctm.filt.sys | grep -v swbd | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_rt03_fsh_sw1_fg/score*/rt03_hires.ctm.swbd.filt.sys | utils/best_wer.sh -done From 8596bbf5488d2339908f6eb73ffa2d7654711ab4 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Tue, 23 Jan 2018 14:14:39 -0500 Subject: [PATCH 076/184] [egs] removing broken link in babel multilang setup (#2177) --- egs/babel_multilang/s5/local/datasets/unsupervised_uem.sh | 1 - 1 file changed, 1 deletion(-) delete mode 120000 egs/babel_multilang/s5/local/datasets/unsupervised_uem.sh diff --git a/egs/babel_multilang/s5/local/datasets/unsupervised_uem.sh b/egs/babel_multilang/s5/local/datasets/unsupervised_uem.sh deleted file mode 120000 index 5065f95a98c..00000000000 --- a/egs/babel_multilang/s5/local/datasets/unsupervised_uem.sh +++ /dev/null @@ -1 +0,0 @@ -../../../../babel/s5d/local/datasets/unsupervised_uem.sh \ No newline at end of file From 476cb3f6b37c4146057ea5b1f916469fdd2f2273 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Tue, 23 Jan 2018 14:37:20 -0500 Subject: [PATCH 077/184] [egs] fixes for the IAM example (#2176) --- egs/iam/v1/local/check_tools.sh | 45 ++++++++++++++++++++++++++++++++ egs/iam/v1/local/prepare_data.sh | 1 + 
egs/iam/v1/local/prepare_dict.sh | 2 +- egs/iam/v1/run.sh | 9 +++++-- 4 files changed, 54 insertions(+), 3 deletions(-) create mode 100755 egs/iam/v1/local/check_tools.sh diff --git a/egs/iam/v1/local/check_tools.sh b/egs/iam/v1/local/check_tools.sh new file mode 100755 index 00000000000..aa4fe70fa64 --- /dev/null +++ b/egs/iam/v1/local/check_tools.sh @@ -0,0 +1,45 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 2>/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread']" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image and Pillow installed." + exit 1 +fi + + +exit 0 + + diff --git a/egs/iam/v1/local/prepare_data.sh b/egs/iam/v1/local/prepare_data.sh index 1350c5841df..e751d5ff71a 100755 --- a/egs/iam/v1/local/prepare_data.sh +++ b/egs/iam/v1/local/prepare_data.sh @@ -35,6 +35,7 @@ if [[ ! 
-f $download_dir/lines.tgz && -z $username ]]; then echo "Please register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database" echo "... and then call this script again with --username --password " echo "" + exit 1 fi lines=data/local/lines diff --git a/egs/iam/v1/local/prepare_dict.sh b/egs/iam/v1/local/prepare_dict.sh index 77a46df384f..0c3bb325023 100755 --- a/egs/iam/v1/local/prepare_dict.sh +++ b/egs/iam/v1/local/prepare_dict.sh @@ -37,7 +37,7 @@ while(<>){ @A = split; }' | sort > $dir/lexicon.txt -sed -i "s/#//" $dir/nonsilence_phones.txt +sed -i '' "s/#//" $dir/nonsilence_phones.txt echo ' SIL' >> $dir/lexicon.txt echo ' SIL' >> $dir/lexicon.txt diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh index c8ebb9ae649..d5f66ca4110 100755 --- a/egs/iam/v1/run.sh +++ b/egs/iam/v1/run.sh @@ -7,7 +7,8 @@ set -e stage=0 nj=20 - +username= +password= # iam_database points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # like "data/download" and follow the instructions @@ -20,9 +21,13 @@ iam_database=/export/corpora5/handwriting_ocr/IAM . ./utils/parse_options.sh # e.g. this parses the above options # if supplied. + +./local/check_tools.sh + if [ $stage -le 0 ]; then echo "$0: Preparing data..." - local/prepare_data.sh --download-dir "$iam_database" + local/prepare_data.sh --download-dir "$iam_database" \ + --username "$username" --password "$password" fi mkdir -p data/{train,test}/data From 49b1562edba0fb470a61a8b09dea48e6a6cc0fd1 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 23 Jan 2018 16:57:04 -0500 Subject: [PATCH 078/184] [egs] Add slightly fixed example. 
--- .../s5c/local/chain/tuning/run_tdnn_7m19h.sh | 14 +- .../s5c/local/chain/tuning/run_tdnn_7m23h.sh | 519 ++++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1m.sh | 5 +- 3 files changed, 530 insertions(+), 8 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23h.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19h.sh index b509517da68..9ce9a790e2f 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19h.sh @@ -3,13 +3,13 @@ # 7m19h is as 7m19e but with an extra bypass connection. A bit better. # local/chain/compare_wer_general.sh --rt03 tdnn7m19e_sp tdnn7m19h_sp -# System tdnn7m19e_sp tdnn7m19h_sp -# WER on train_dev(tg) 12.75 12.65 -# WER on train_dev(fg) 11.77 11.57 -# WER on eval2000(tg) 15.5 15.3 -# WER on eval2000(fg) 14.0 13.7 -# WER on rt03(tg) 18.9 18.8 -# WER on rt03(fg) 16.4 16.4 +# System tdnn7m19e_sp tdnn7m19h_sp [rerun of 17m19h:] +# WER on train_dev(tg) 12.75 12.65 12.61 +# WER on train_dev(fg) 11.77 11.57 11.72 +# WER on eval2000(tg) 15.5 15.3 15.4 +# WER on eval2000(fg) 14.0 13.7 13.7 +# WER on rt03(tg) 18.9 18.8 18.9 +# WER on rt03(fg) 16.4 16.4 16.3 # Final train prob -0.092 -0.091 # Final valid prob -0.102 -0.102 # Final train prob (xent) -1.094 -1.091 diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23h.sh new file mode 100755 index 00000000000..7761cb1c24e --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23h.sh @@ -0,0 +1,519 @@ +#!/bin/bash + +# 7m23h is as 7m23b2 but with a small bugfix, removing a stray 'bottleneck-dim=192'. +# Seems slightly better. The comparison below includes our old TDNN+LSTM result +# with dropout, to show that we're doing better than that now. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn_lstm_1m_ld5_sp tdnn7m23b2_sp tdnn7m23h_sp +# System tdnn_lstm_1m_ld5_sp tdnn7m23b2_sp tdnn7m23h_sp +# WER on train_dev(tg) 12.33 12.38 12.28 +# WER on train_dev(fg) 11.42 11.44 11.21 +# WER on eval2000(tg) 15.2 15.1 15.0 +# WER on eval2000(fg) 13.8 13.6 13.5 +# WER on rt03(tg) 18.6 18.4 18.5 +# WER on rt03(fg) 16.3 16.1 16.1 +# Final train prob -0.082 -0.084 -0.083 +# Final valid prob -0.099 -0.098 -0.097 +# Final train prob (xent) -0.959 -1.049 -1.036 +# Final valid prob (xent) -1.0305 -1.0661 -1.0629 +# Num-parameters 39558436 23120164 23513380 +# +# 7m23b2 is as 7m23b but fixing an issue at the last layers. +# 7m23b is as 7m23 but making the splicing more 'symmetric'... doing the +# splicing in 2 stages. Interestingly, objf is not better than 23, but +# WER is slightly better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19m_sp tdnn7m23_sp tdnn7m23b2_sp +# System tdnn7m19m_sp tdnn7m23_sp tdnn7m23b2_sp +# WER on train_dev(tg) 12.55 12.23 12.38 +# WER on train_dev(fg) 11.52 11.29 11.44 +# WER on eval2000(tg) 15.2 15.2 15.1 +# WER on eval2000(fg) 13.6 13.7 13.6 +# WER on rt03(tg) 18.6 18.7 18.4 +# WER on rt03(fg) 16.2 16.3 16.1 +# Final train prob -0.089 -0.083 -0.084 +# Final valid prob -0.101 -0.097 -0.098 +# Final train prob (xent) -1.080 -1.025 -1.049 +# Final valid prob (xent) -1.0990 -1.0548 -1.0661 +# Num-parameters 21055012 23120164 23120164 + + +# 7m23 is as 7m19m but removing the bottlenecks from the batchnorm components and +# reducing the dim of the linear components... it's basically an attempt to +# reverse the factorization to have the splicing at a different point. +# + +# 7m19m is as 7m19l but with more skip connections +# Hm-- seems better than 19h. 
+# +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# System tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# WER on train_dev(tg) 12.61 12.72 12.55 +# WER on train_dev(fg) 11.72 11.62 11.52 +# WER on eval2000(tg) 15.4 15.4 15.2 +# WER on eval2000(fg) 13.7 13.8 13.6 +# WER on rt03(tg) 18.9 18.9 18.6 +# WER on rt03(fg) 16.3 16.4 16.2 +# Final train prob -0.091 -0.091 -0.089 +# Final valid prob -0.102 -0.103 -0.101 +# Final train prob (xent) -1.098 -1.095 -1.080 +# Final valid prob (xent) -1.1031 -1.1191 -1.0990 +# Num-parameters 21055012 20268580 21055012 +# +# 7m19l is as 7m19h but projecting down to an intermediate dim (512) before +# doing the Append... doing this by inserting a linear-component between +# pairs of relu-batchnorm-layers. +# A little worse. +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp +# System tdnn7m19h_sp tdnn7m19l_sp +# WER on train_dev(tg) 12.65 12.72 +# WER on train_dev(fg) 11.57 11.62 +# WER on eval2000(tg) 15.3 15.4 +# WER on eval2000(fg) 13.7 13.8 +# WER on rt03(tg) 18.8 18.9 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.091 -0.091 +# Final valid prob -0.102 -0.103 +# Final train prob (xent) -1.091 -1.095 +# Final valid prob (xent) -1.1064 -1.1191 +# Num-parameters 21055012 20268580 + + +# 7m19h is as 7m19e but with an extra bypass connection. A bit better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19e_sp tdnn7m19h_sp +# System tdnn7m19e_sp tdnn7m19h_sp +# WER on train_dev(tg) 12.75 12.65 +# WER on train_dev(fg) 11.77 11.57 +# WER on eval2000(tg) 15.5 15.3 +# WER on eval2000(fg) 14.0 13.7 +# WER on rt03(tg) 18.9 18.8 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.092 -0.091 +# Final valid prob -0.102 -0.102 +# Final train prob (xent) -1.094 -1.091 +# Final valid prob (xent) -1.1095 -1.1064 +# Num-parameters 20760100 21055012 + +# 7m19e is as 7m19c,d but with dims increased to 1536. Better! 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# System tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# WER on train_dev(tg) 13.77 12.86 13.01 12.75 +# WER on train_dev(fg) 12.65 11.82 12.02 11.77 +# WER on eval2000(tg) 16.1 15.4 15.7 15.5 +# WER on eval2000(fg) 14.3 13.8 14.0 14.0 +# WER on rt03(tg) 19.9 19.1 19.2 18.9 +# WER on rt03(fg) 17.4 16.6 16.7 16.4 +# Final train prob -0.111 -0.094 -0.096 -0.092 +# Final valid prob -0.120 -0.103 -0.105 -0.102 +# Final train prob (xent) -1.314 -1.117 -1.144 -1.094 +# Final valid prob (xent) -1.3247 -1.1223 -1.1478 -1.1095 +# Num-parameters 13361700 17824036 14887972 20760100 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# System tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# WER on train_dev(tg) 13.37 13.09 12.93 12.86 13.01 +# WER on train_dev(fg) 12.47 12.12 11.87 11.82 12.02 +# WER on eval2000(tg) 15.8 15.8 15.6 15.4 15.7 +# WER on eval2000(fg) 14.3 14.3 14.0 13.8 14.0 +# WER on rt03(tg) 15.1 14.8 14.9 14.8 14.9 +# WER on rt03(fg) 12.7 12.4 12.5 12.5 12.6 +# Final train prob -0.099 -0.096 -0.096 -0.094 -0.096 +# Final valid prob -0.110 -0.106 -0.106 -0.103 -0.105 +# Final train prob (xent) -1.302 -1.198 -1.188 -1.117 -1.144 +# Final valid prob (xent) -1.3184 -1.2070 -1.1980 -1.1223 -1.1478 +# Num-parameters 14216996 15528996 16512036 17824036 14887972 + +# 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). +# Seems about 0.1% better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# WER on train_dev(tg) 13.09 12.93 12.86 +# WER on train_dev(fg) 12.12 11.87 11.82 +# WER on eval2000(tg) 15.8 15.6 15.4 +# WER on eval2000(fg) 14.3 14.0 13.8 +# WER on rt03(tg) 14.8 14.9 14.8 +# WER on rt03(fg) 12.4 12.5 12.5 +# Final train prob -0.096 -0.096 -0.094 +# Final valid prob -0.106 -0.106 -0.103 +# Final train prob (xent) -1.198 -1.188 -1.117 +# Final valid prob (xent) -1.2070 -1.1980 -1.1223 +# Num-parameters 15528996 16512036 17824036 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 14.8 14.9 +# WER on rt03(fg) 12.4 12.5 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. 
+# Seems a little better but could be due to the increase in parameters. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp +# System tdnn7m8_sp tdnn7m9_sp +# WER on train_dev(tg) 13.60 13.88 +# WER on train_dev(fg) 12.62 12.64 +# WER on eval2000(tg) 16.8 16.1 +# WER on eval2000(fg) 15.4 14.4 +# WER on rt03(tg) 16.2 15.5 +# WER on rt03(fg) 13.7 13.1 +# Final train prob -0.105 -0.111 +# Final valid prob -0.115 -0.119 +# Final train prob (xent) -1.282 -1.309 +# Final valid prob (xent) -1.3194 -1.3246 +# Num-parameters 11580452 13818148 + +# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which +# is the same as 7m2->7m3, which was helpful there. +# Does seem helpful. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# WER on train_dev(tg) 13.70 13.74 13.81 13.60 +# WER on train_dev(fg) 12.67 12.76 12.74 12.62 +# WER on eval2000(tg) 16.6 17.1 17.0 16.8 +# WER on eval2000(fg) 15.1 15.4 15.4 15.4 +# WER on rt03(tg) 16.1 16.2 16.0 16.2 +# WER on rt03(fg) 13.7 13.8 13.6 13.7 +# Final train prob -0.085 -0.106 -0.104 -0.105 +# Final valid prob -0.103 -0.118 -0.116 -0.115 +# Final train prob (xent) -1.230 -1.296 -1.285 -1.282 +# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194 +# Num-parameters 16292693 10924836 11580452 11580452 + + +# 7m5b is as 7m5 but rducing the prefinal layer dims to previous values. +# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average, +# but not clear at all... for consistency with other setups I may retain +# this change. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# WER on train_dev(tg) 13.70 13.74 13.71 13.81 +# WER on train_dev(fg) 12.67 12.76 12.64 12.74 +# WER on eval2000(tg) 16.6 17.1 16.8 17.0 +# WER on eval2000(fg) 15.1 15.4 15.1 15.4 +# WER on rt03(tg) 16.1 16.2 16.2 16.0 +# WER on rt03(fg) 13.7 13.8 13.8 13.6 +# Final train prob -0.085 -0.106 -0.103 -0.104 +# Final valid prob -0.103 -0.118 -0.114 -0.116 +# Final train prob (xent) -1.230 -1.296 -1.274 -1.285 +# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283 +# Num-parameters 16292693 10924836 12170788 11580452 + + +# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer +# and the prefinal layers from 512 to 768. +# 7m2 is as 7m but with a bunch of tuning changes (model is smaller). +# 7m is as 7k but adding two non-splicing layers towards the beginning of the +# network. 
+# The improvement is pretty small but I've seen similar improvements on other
+# setups with this architecture so I tend to believe it.
+
+
+# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp
+# System tdnn_7k_sp tdnn_7m_sp
+# WER on train_dev(tg) 13.83 13.65
+# WER on train_dev(fg) 12.74 12.54
+# WER on eval2000(tg) 16.9 16.8
+# WER on eval2000(fg) 15.2 15.1
+# Final train prob -0.085 -0.084
+# Final valid prob -0.107 -0.103
+# Final train prob (xent) -1.267 -1.215
+# Final valid prob (xent) -1.3107 -1.2735
+
+# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp
+# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103)
+
+set -e
+
+# configs for 'chain'
+stage=0
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+affix=7m23h
+suffix=
+$speed_perturb && suffix=_sp
+if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi
+
+dir=exp/chain/tdnn${affix}${suffix}
+decode_iter=
+decode_nj=50
+
+# training options
+frames_per_eg=150,110,100
+remove_egs=false
+common_egs_dir=
+xent_regularize=0.1
+
+test_online_decoding=false # if true, it will run the last decoding stage.
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+ cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+ # Build a tree using our new topology. This is the critically different
+ # step compared with other recipes.
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1536 + linear-component name=tdnn1l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1536 + linear-component name=tdnn2l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1536 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1536 + linear-component name=tdnn4l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1536 input=Append(tdnn4l, tdnn2l) + linear-component name=tdnn5l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn5l,tdnn3l,tdnn1l) dim=1536 + linear-component name=tdnn7l dim=256 $linear_opts 
input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn7l,tdnn5l,tdnn3l) dim=1536 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn9l,tdnn7l,tdnn5l) dim=1536 + linear-component name=tdnn11l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=tdnn11l $opts dim=1536 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=tdnn11l $opts dim=1536 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh index 4b2c93082d9..b50692616c4 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -11,6 +11,9 @@ #WER on train_dev(fg) 11.59 11.46 11.41 #WER on eval2000(tg) 14.8 14.8 14.9 #WER on eval2000(fg) 13.5 13.5 13.6 +# WER on rt03(tg) 18.6 +# WER on rt03(fg) 16.3 + #Final train prob -0.069 -0.081 #Final valid prob -0.095 -0.100 #Final train prob (xent) -0.913 -0.950 @@ -258,7 +261,7 @@ if $test_online_decoding && [ $stage -le 16 ]; then $lang exp/nnet3/extractor $dir ${dir}_online rm $dir/.error 2>/dev/null || true - for decode_set in train_dev eval2000 rt03; do + for decode_set in train_dev eval2000 $maybe_rt03; do ( # note: we just give it "$decode_set" as it only uses the wav.scp, the # feature type does not matter. 
From 4bb2c5c2f3cc5b45db8db7f4150f4b7c0fa3a483 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 23 Jan 2018 18:28:55 -0500 Subject: [PATCH 079/184] [src] Speed fix to online decoding (thanks: David van Leeuwen) --- src/nnet3/decodable-simple-looped.cc | 19 ++++++----- src/nnet3/nnet-compile-looped.cc | 50 ++++++++++++++++++---------- src/nnet3/nnet-compile-looped.h | 34 ++++++++++++++++--- 3 files changed, 72 insertions(+), 31 deletions(-) diff --git a/src/nnet3/decodable-simple-looped.cc b/src/nnet3/decodable-simple-looped.cc index df18d605b7d..d4edb440d5a 100644 --- a/src/nnet3/decodable-simple-looped.cc +++ b/src/nnet3/decodable-simple-looped.cc @@ -60,9 +60,10 @@ void DecodableNnetSimpleLoopedInfo::Init( KALDI_ASSERT(IsSimpleNnet(*nnet)); has_ivectors = (nnet->InputDim("ivector") > 0); int32 left_context, right_context; + int32 extra_right_context = 0; ComputeSimpleNnetContext(*nnet, &left_context, &right_context); frames_left_context = left_context + opts.extra_left_context_initial; - frames_right_context = right_context; + frames_right_context = right_context + extra_right_context; frames_per_chunk = GetChunkSize(*nnet, opts.frame_subsampling_factor, opts.frames_per_chunk); output_dim = nnet->OutputDim("output"); @@ -73,14 +74,14 @@ void DecodableNnetSimpleLoopedInfo::Init( ModifyNnetIvectorPeriod(ivector_period, nnet); int32 num_sequences = 1; // we're processing one utterance at a time. 
- int32 extra_right_context = 0; - CreateLoopedComputationRequestSimple(*nnet, frames_per_chunk, - opts.frame_subsampling_factor, - ivector_period, - opts.extra_left_context_initial, - extra_right_context, - num_sequences, - &request1, &request2, &request3); + + CreateLoopedComputationRequest(*nnet, frames_per_chunk, + opts.frame_subsampling_factor, + ivector_period, + frames_left_context, + frames_right_context, + num_sequences, + &request1, &request2, &request3); CompileLooped(*nnet, opts.optimize_config, request1, request2, request3, &computation); diff --git a/src/nnet3/nnet-compile-looped.cc b/src/nnet3/nnet-compile-looped.cc index b0ca42f15ab..fa8a2322e5a 100644 --- a/src/nnet3/nnet-compile-looped.cc +++ b/src/nnet3/nnet-compile-looped.cc @@ -150,26 +150,24 @@ static void CreateComputationRequestInternal( } -void CreateLoopedComputationRequestSimple(const Nnet &nnet, - int32 chunk_size, - int32 frame_subsampling_factor, - int32 ivector_period, - int32 extra_left_context_begin, - int32 extra_right_context, - int32 num_sequences, - ComputationRequest *request1, - ComputationRequest *request2, - ComputationRequest *request3) { +void CreateLoopedComputationRequest(const Nnet &nnet, + int32 chunk_size, + int32 frame_subsampling_factor, + int32 ivector_period, + int32 left_context_begin, + int32 right_context, + int32 num_sequences, + ComputationRequest *request1, + ComputationRequest *request2, + ComputationRequest *request3) { bool has_ivector = (nnet.InputDim("ivector") > 0); - int32 left_context, right_context; - ComputeSimpleNnetContext(nnet, &left_context, &right_context); KALDI_ASSERT(chunk_size % frame_subsampling_factor == 0 && chunk_size % nnet.Modulus() == 0 && chunk_size % ivector_period == 0); - KALDI_ASSERT(extra_left_context_begin >= 0 && extra_right_context >= 0); + KALDI_ASSERT(left_context_begin >= 0 && right_context >= 0); // note, 'end' is one past the last one. 
- int32 chunk1_input_begin_t = - left_context - extra_left_context_begin, - chunk1_input_end_t = chunk_size + right_context + extra_right_context, + int32 chunk1_input_begin_t = - left_context_begin, + chunk1_input_end_t = chunk_size + right_context, chunk2_input_begin_t = chunk1_input_end_t, chunk2_input_end_t = chunk2_input_begin_t + chunk_size, chunk3_input_begin_t = chunk2_input_end_t, @@ -349,10 +347,26 @@ void CompileLooped(const Nnet &nnet, } +void CreateLoopedComputationRequestSimple(const Nnet &nnet, + int32 chunk_size, + int32 frame_subsampling_factor, + int32 ivector_period, + int32 extra_left_context_begin, + int32 extra_right_context, + int32 num_sequences, + ComputationRequest *request1, + ComputationRequest *request2, + ComputationRequest *request3) { + bool has_ivector = (nnet.InputDim("ivector") > 0); + int32 left_context, right_context; + ComputeSimpleNnetContext(nnet, &left_context, &right_context); - - - + CreateLoopedComputationRequest(nnet, chunk_size, frame_subsampling_factor, + ivector_period, + extra_left_context_begin + left_context, + extra_right_context + right_context, + num_sequences, request1, request2, request3); +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-compile-looped.h b/src/nnet3/nnet-compile-looped.h index 2ebb371ecc5..7c1bb655c42 100644 --- a/src/nnet3/nnet-compile-looped.h +++ b/src/nnet3/nnet-compile-looped.h @@ -132,10 +132,17 @@ void ModifyNnetIvectorPeriod(int32 ivector_period, 'nnet' before calling this function; otherwise the neural net will most likely not actually be able to consume the iVector with this frequency. - @param [in] extra_left_context_begin The additional left-context that - should be supplied to the network on top of the minimum - that the network requires. We call this extra_left_context_begin - because this only relates to the start of the utterance (t=0). 
+ @param [in] left_context_begin This should be the left-context of the network
+ plus any additional left-context (provided via the option
+ --extra-left-context-begin) that should be supplied to the
+ network on top of the minimum that the network requires. We call
+ this left_context_begin because this only relates to the
+ start of the utterance (t=0).
+ @param [in] right_context This should be the right-context of the network,
+ plus any additional right-context ("extra-right-context") that
+ should be supplied to the network on top of the minimum that the
+ network requires (currently extra-right-context != 0 is not
+ supported at the command-line level).
 @param [in] num_sequences The number of separate 'n' values to put in
 the computation; normally this will be just 1, but it can be
 increased to allow simultaneous operation on multiple streams of
 input.
@@ -152,6 +159,25 @@ void ModifyNnetIvectorPeriod(int32 ivector_period,
 @param [out] request3 The third of the 3 requests that this function
 generates. It will be the same as request2, except for a time offset.
 */
+void CreateLoopedComputationRequest(const Nnet &nnet,
+ int32 chunk_size,
+ int32 frame_subsampling_factor,
+ int32 ivector_period,
+ int32 left_context_begin,
+ int32 right_context,
+ int32 num_sequences,
+ ComputationRequest *request1,
+ ComputationRequest *request2,
+ ComputationRequest *request3);
+
+
+/**
+ This function is deprecated. It has the same interface as
+ CreateLoopedComputationRequest(), except that the left and right context are
+ specified in a different way (as just the 'extra' part). It is deprecated because
+ this function has to work out the left and right context of the network, which
+ turns out to be quite slow if it's done after you call ModifyNnetIvectorPeriod().
+*/ void CreateLoopedComputationRequestSimple(const Nnet &nnet, int32 chunk_size, int32 frame_subsampling_factor, From b73bb12e41e2843a3fd0c660b15506a3bc985a7d Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Wed, 24 Jan 2018 04:02:03 +0330 Subject: [PATCH 080/184] [egs] Make sure scoring opts are passed to score_cer.sh in UW3 (#2181) --- egs/uw3/v1/local/score.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/egs/uw3/v1/local/score.sh b/egs/uw3/v1/local/score.sh index 9ea4701a833..3a6aeaa08ad 100755 --- a/egs/uw3/v1/local/score.sh +++ b/egs/uw3/v1/local/score.sh @@ -143,7 +143,9 @@ if [ $stage -le 1 ]; then fi fi -steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 $data $lang_or_graph $dir +steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 --min-lmwt $min_lmwt \ + --max-lmwt $max_lmwt --word-ins-penalty $word_ins_penalty \ + $data $lang_or_graph $dir # If we got here, the scoring was successful. # As a small aid to prevent confusion, we remove all wer_{?,??} files; From 47de1452869f9128c6fa34bb3f1d06c503e4ef2d Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Tue, 23 Jan 2018 17:37:53 -0800 Subject: [PATCH 081/184] [scripts] Fix typos in scripts (#2182) --- egs/wsj/s5/steps/nnet2/check_ivectors_compatible.sh | 4 ++-- egs/wsj/s5/steps/nnet2/get_ivector_id.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/egs/wsj/s5/steps/nnet2/check_ivectors_compatible.sh b/egs/wsj/s5/steps/nnet2/check_ivectors_compatible.sh index dcfa6cf59b8..7c5d3a3254d 100755 --- a/egs/wsj/s5/steps/nnet2/check_ivectors_compatible.sh +++ b/egs/wsj/s5/steps/nnet2/check_ivectors_compatible.sh @@ -7,8 +7,8 @@ #echo >&2 "$0 $@" # Print the command line for logging if [ $# != 2 ] ; then - echo >$2 "Usage: $0 " - echo >$2 " e.g.: $0 exp/nnet3/extractor exp/nnet3/ivectors_dev10h.pem" + echo >&2 "Usage: $0 " + echo >&2 " e.g.: $0 exp/nnet3/extractor exp/nnet3/ivectors_dev10h.pem" fi dir_a=$1 diff --git a/egs/wsj/s5/steps/nnet2/get_ivector_id.sh 
b/egs/wsj/s5/steps/nnet2/get_ivector_id.sh index 7adcfac11c7..1ec60d1f514 100755 --- a/egs/wsj/s5/steps/nnet2/get_ivector_id.sh +++ b/egs/wsj/s5/steps/nnet2/get_ivector_id.sh @@ -16,8 +16,8 @@ if [ -f path.sh ]; then . ./path.sh; fi if [ $# != 1 ]; then - echo >$2 "Usage: $0 " - echo >$2 " e.g.: $0 exp/nnet3/extractor" + echo >&2 "Usage: $0 " + echo >&2 " e.g.: $0 exp/nnet3/extractor" exit 1 fi From 2e105fc455246540e53f64be19060d594abfc83f Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Wed, 24 Jan 2018 10:08:35 +0800 Subject: [PATCH 082/184] [scripts] Fix to get_num_frames.sh for large datasets, RE truncation in awk (#2174) --- egs/wsj/s5/utils/data/get_num_frames.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/utils/data/get_num_frames.sh b/egs/wsj/s5/utils/data/get_num_frames.sh index f3589b2eb06..996468631fa 100755 --- a/egs/wsj/s5/utils/data/get_num_frames.sh +++ b/egs/wsj/s5/utils/data/get_num_frames.sh @@ -22,4 +22,4 @@ fi frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 -awk -v s=$frame_shift '{n += $2} END{printf("%d\n", int(n / s))}' <$data/utt2dur +awk -v s=$frame_shift '{n += $2} END{printf("%.0f\n", (n / s))}' <$data/utt2dur From 7cf434ce1267cfed3101ee723aad7f9e7c42aebb Mon Sep 17 00:00:00 2001 From: Szu-JuiChen <31828751+Szu-JuiChen@users.noreply.github.com> Date: Tue, 23 Jan 2018 21:41:53 -0500 Subject: [PATCH 083/184] [scripts] Fix python3 compatibility bug (#2184) --- egs/wsj/s5/steps/diagnostic/analyze_lattice_depth_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/diagnostic/analyze_lattice_depth_stats.py b/egs/wsj/s5/steps/diagnostic/analyze_lattice_depth_stats.py index 7b91b905c3a..56b9f69b3c9 100755 --- a/egs/wsj/s5/steps/diagnostic/analyze_lattice_depth_stats.py +++ b/egs/wsj/s5/steps/diagnostic/analyze_lattice_depth_stats.py @@ -68,7 +68,7 @@ phone_depth_counts = dict() # note: -1 is for all phones put in one bucket. 
-for p in [ -1 ] + phone_int2text.keys(): +for p in [ -1 ] + list(phone_int2text.keys()): phone_depth_counts[p] = defaultdict(int) total_frames = 0 From ed84a510745cb00630138b8e1f19538d29536728 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Wed, 24 Jan 2018 13:32:40 -0500 Subject: [PATCH 084/184] [scripts] Prevent crash when input_model is None (#2188) --- egs/wsj/s5/steps/nnet3/chain/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index b62f5510e3c..6896da67f73 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -218,7 +218,7 @@ def process_args(args): if (not os.path.exists(args.dir) or (not os.path.exists(args.dir+"/configs") and - not os.path.exists(args.input_model))): + (args.input_model is None or not os.path.exists(args.input_model)))): raise Exception("This script expects {0} to exist. Also either " "--trainer.input-model option as initial 'raw' model " "(used as 0.raw in the script) should be supplied or " From 6fed4c7dd0b64f82611001793e383ab657eb3a65 Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Wed, 24 Jan 2018 10:33:14 -0800 Subject: [PATCH 085/184] [src] Remove CuDevice destructor (avoid cuda-memcheck warnings) (#2185) --- src/cudamatrix/cu-device.cc | 8 -------- src/cudamatrix/cu-device.h | 1 - 2 files changed, 9 deletions(-) diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index ba0db7df08d..9b0976b05ad 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -547,14 +547,6 @@ CuDevice::CuDevice() : multi_threaded_(false) { } -CuDevice::~CuDevice() { - if (Enabled()) { - cublasDestroy(handle_); - cusparseDestroy(cusparse_handle_); - cudaDeviceReset(); - } -} - // The instance of the static singleton CuDevice CuDevice::global_device_; } diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index c355549648b..99105355a8f 100644 
--- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -47,7 +47,6 @@ class CuTimer; class CuDevice { // Singleton object (there should only be one instantiated per program) public: - ~CuDevice(); static inline CuDevice& Instantiate() { return global_device_; } inline cublasHandle_t GetHandle() { return handle_; } From 7ee7893f9d1ece40029d2e8ec1bc91a9708c66b4 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Wed, 24 Jan 2018 14:45:44 -0500 Subject: [PATCH 086/184] [src] Fix nnet3 back-compatibility issue (thx: @satmass) --- src/nnet3/nnet-simple-component.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 36fb0db520d..91906ac1ddf 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -361,6 +361,8 @@ void NormalizeComponent::Read(std::istream &is, bool binary) { if (token == "") { ReadBasicType(is, binary, &add_log_stddev_); ReadToken(is, binary, &token); + } else { + add_log_stddev_ = false; } if (token == "") { // back-compatibility code. 
From f861b00ca0087e7f265e5ce80cbe199ebf44d046 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 25 Jan 2018 12:42:33 -0500 Subject: [PATCH 087/184] [src] Speed fix to online decoding (thanks: David van Leeuwen) (#2180) --- src/nnet3/decodable-simple-looped.cc | 19 ++++++----- src/nnet3/nnet-compile-looped.cc | 50 ++++++++++++++++++---------- src/nnet3/nnet-compile-looped.h | 34 ++++++++++++++++--- src/nnet3bin/nnet3-am-copy.cc | 13 +++++++- 4 files changed, 84 insertions(+), 32 deletions(-) diff --git a/src/nnet3/decodable-simple-looped.cc b/src/nnet3/decodable-simple-looped.cc index df18d605b7d..d4edb440d5a 100644 --- a/src/nnet3/decodable-simple-looped.cc +++ b/src/nnet3/decodable-simple-looped.cc @@ -60,9 +60,10 @@ void DecodableNnetSimpleLoopedInfo::Init( KALDI_ASSERT(IsSimpleNnet(*nnet)); has_ivectors = (nnet->InputDim("ivector") > 0); int32 left_context, right_context; + int32 extra_right_context = 0; ComputeSimpleNnetContext(*nnet, &left_context, &right_context); frames_left_context = left_context + opts.extra_left_context_initial; - frames_right_context = right_context; + frames_right_context = right_context + extra_right_context; frames_per_chunk = GetChunkSize(*nnet, opts.frame_subsampling_factor, opts.frames_per_chunk); output_dim = nnet->OutputDim("output"); @@ -73,14 +74,14 @@ void DecodableNnetSimpleLoopedInfo::Init( ModifyNnetIvectorPeriod(ivector_period, nnet); int32 num_sequences = 1; // we're processing one utterance at a time. 
- int32 extra_right_context = 0; - CreateLoopedComputationRequestSimple(*nnet, frames_per_chunk, - opts.frame_subsampling_factor, - ivector_period, - opts.extra_left_context_initial, - extra_right_context, - num_sequences, - &request1, &request2, &request3); + + CreateLoopedComputationRequest(*nnet, frames_per_chunk, + opts.frame_subsampling_factor, + ivector_period, + frames_left_context, + frames_right_context, + num_sequences, + &request1, &request2, &request3); CompileLooped(*nnet, opts.optimize_config, request1, request2, request3, &computation); diff --git a/src/nnet3/nnet-compile-looped.cc b/src/nnet3/nnet-compile-looped.cc index b0ca42f15ab..fa8a2322e5a 100644 --- a/src/nnet3/nnet-compile-looped.cc +++ b/src/nnet3/nnet-compile-looped.cc @@ -150,26 +150,24 @@ static void CreateComputationRequestInternal( } -void CreateLoopedComputationRequestSimple(const Nnet &nnet, - int32 chunk_size, - int32 frame_subsampling_factor, - int32 ivector_period, - int32 extra_left_context_begin, - int32 extra_right_context, - int32 num_sequences, - ComputationRequest *request1, - ComputationRequest *request2, - ComputationRequest *request3) { +void CreateLoopedComputationRequest(const Nnet &nnet, + int32 chunk_size, + int32 frame_subsampling_factor, + int32 ivector_period, + int32 left_context_begin, + int32 right_context, + int32 num_sequences, + ComputationRequest *request1, + ComputationRequest *request2, + ComputationRequest *request3) { bool has_ivector = (nnet.InputDim("ivector") > 0); - int32 left_context, right_context; - ComputeSimpleNnetContext(nnet, &left_context, &right_context); KALDI_ASSERT(chunk_size % frame_subsampling_factor == 0 && chunk_size % nnet.Modulus() == 0 && chunk_size % ivector_period == 0); - KALDI_ASSERT(extra_left_context_begin >= 0 && extra_right_context >= 0); + KALDI_ASSERT(left_context_begin >= 0 && right_context >= 0); // note, 'end' is one past the last one. 
- int32 chunk1_input_begin_t = - left_context - extra_left_context_begin, - chunk1_input_end_t = chunk_size + right_context + extra_right_context, + int32 chunk1_input_begin_t = - left_context_begin, + chunk1_input_end_t = chunk_size + right_context, chunk2_input_begin_t = chunk1_input_end_t, chunk2_input_end_t = chunk2_input_begin_t + chunk_size, chunk3_input_begin_t = chunk2_input_end_t, @@ -349,10 +347,26 @@ void CompileLooped(const Nnet &nnet, } +void CreateLoopedComputationRequestSimple(const Nnet &nnet, + int32 chunk_size, + int32 frame_subsampling_factor, + int32 ivector_period, + int32 extra_left_context_begin, + int32 extra_right_context, + int32 num_sequences, + ComputationRequest *request1, + ComputationRequest *request2, + ComputationRequest *request3) { + bool has_ivector = (nnet.InputDim("ivector") > 0); + int32 left_context, right_context; + ComputeSimpleNnetContext(nnet, &left_context, &right_context); - - - + CreateLoopedComputationRequest(nnet, chunk_size, frame_subsampling_factor, + ivector_period, + extra_left_context_begin + left_context, + extra_right_context + right_context, + num_sequences, request1, request2, request3); +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-compile-looped.h b/src/nnet3/nnet-compile-looped.h index 2ebb371ecc5..7c1bb655c42 100644 --- a/src/nnet3/nnet-compile-looped.h +++ b/src/nnet3/nnet-compile-looped.h @@ -132,10 +132,17 @@ void ModifyNnetIvectorPeriod(int32 ivector_period, 'nnet' before calling this function; otherwise the neural net will most likely not actually be able to consume the iVector with this frequency. - @param [in] extra_left_context_begin The additional left-context that - should be supplied to the network on top of the minimum - that the network requires. We call this extra_left_context_begin - because this only relates to the start of the utterance (t=0). 
+ @param [in] left_context_begin This should be the left-context of the network
+ plus any additional left-context (provided via the option
+ --extra-left-context-begin) that should be supplied to the
+ network on top of the minimum that the network requires. We call
+ this left_context_begin because this only relates to the
+ start of the utterance (t=0).
+ @param [in] right_context This should be the right-context of the network,
+ plus any additional right-context ("extra-right-context") that
+ should be supplied to the network on top of the minimum that the
+ network requires (currently extra-right-context != 0 is not
+ supported at the command-line level).
 @param [in] num_sequences The number of separate 'n' values to put in
 the computation; normally this will be just 1, but it can be
 increased to allow simultaneous operation on multiple streams of
 input.
@@ -152,6 +159,25 @@ void ModifyNnetIvectorPeriod(int32 ivector_period,
 @param [out] request3 The third of the 3 requests that this function
 generates. It will be the same as request2, except for a time offset.
 */
+void CreateLoopedComputationRequest(const Nnet &nnet,
+ int32 chunk_size,
+ int32 frame_subsampling_factor,
+ int32 ivector_period,
+ int32 left_context_begin,
+ int32 right_context,
+ int32 num_sequences,
+ ComputationRequest *request1,
+ ComputationRequest *request2,
+ ComputationRequest *request3);
+
+
+/**
+ This function is deprecated. It has the same interface as
+ CreateLoopedComputationRequest(), except that the left and right context are
+ specified in a different way (as just the 'extra' part). It is deprecated because
+ this function has to work out the left and right context of the network, which
+ turns out to be quite slow if it's done after you call ModifyNnetIvectorPeriod().
+*/ void CreateLoopedComputationRequestSimple(const Nnet &nnet, int32 chunk_size, int32 frame_subsampling_factor, diff --git a/src/nnet3bin/nnet3-am-copy.cc b/src/nnet3bin/nnet3-am-copy.cc index 5f697356dbf..2230ae77c00 100644 --- a/src/nnet3bin/nnet3-am-copy.cc +++ b/src/nnet3bin/nnet3-am-copy.cc @@ -50,6 +50,7 @@ int main(int argc, char *argv[]) { std::string set_raw_nnet = ""; bool convert_repeated_to_block = false; BaseFloat scale = 1.0; + bool prepare_for_test = false; std::string nnet_config, edits_config, edits_str; ParseOptions po(usage); @@ -81,7 +82,11 @@ int main(int argc, char *argv[]) { " are set to this value."); po.Register("scale", &scale, "The parameter matrices are scaled" " by the specified value."); - + po.Register("prepare-for-test", &prepare_for_test, + "If true, prepares the model for test time (may reduce model size " + "slightly. Involves setting test mode in dropout and batch-norm " + "components, and calling CollapseModel() which may remove some " + "components."); po.Read(argc, argv); @@ -135,6 +140,12 @@ int main(int argc, char *argv[]) { if (scale != 1.0) ScaleNnet(scale, &(am_nnet.GetNnet())); + if (prepare_for_test) { + SetBatchnormTestMode(true, &am_nnet.GetNnet()); + SetDropoutTestMode(true, &am_nnet.GetNnet()); + CollapseModel(CollapseModelConfig(), &am_nnet.GetNnet()); + } + if (raw) { WriteKaldiObject(am_nnet.GetNnet(), nnet_wxfilename, binary_write); KALDI_LOG << "Copied neural net from " << nnet_rxfilename From 7c8e1a3f68bc5840c1dbe0ebcb5c6670221bce28 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 25 Jan 2018 14:08:23 -0500 Subject: [PATCH 088/184] [src] Some drafts of the compression code --- src/cudamatrix/cu-compressed-matrix.h | 143 ++++++++++++ src/nnet3/nnet-analyze.cc | 99 ++++++++- src/nnet3/nnet-analyze.h | 12 +- src/nnet3/nnet-computation.cc | 14 +- src/nnet3/nnet-computation.h | 10 +- src/nnet3/nnet-compute.cc | 33 +++ src/nnet3/nnet-compute.h | 8 + src/nnet3/nnet-optimize-utils.cc | 305 
++++++++++++++++++++++++++ src/nnet3/nnet-optimize-utils.h | 40 ++++ src/nnet3/nnet-optimize.cc | 11 +- src/nnet3/nnet-optimize.h | 13 ++ 11 files changed, 678 insertions(+), 10 deletions(-) create mode 100644 src/cudamatrix/cu-compressed-matrix.h diff --git a/src/cudamatrix/cu-compressed-matrix.h b/src/cudamatrix/cu-compressed-matrix.h new file mode 100644 index 00000000000..557892ae266 --- /dev/null +++ b/src/cudamatrix/cu-compressed-matrix.h @@ -0,0 +1,143 @@ +// cudamatrix/cu-compressed-matrix.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + + +#ifndef KALDI_CUDAMATRIX_CU_COMPRESSED_MATRIX_H_ +#define KALDI_CUDAMATRIX_CU_COMPRESSED_MATRIX_H_ + +#include "cudamatrix/cu-matrix.h" + +namespace kaldi { + +/** + Class CuCompressedMatrixBase is an abstract base class that allows you to + compress a matrix of type CuMatrix. When you instantiate it you + would choose the child-class type (by allocating the appropriate child-class + type via 'new'). + */ +class CuCompressedMatrixBase { + public: + + /// Sets *this to an appropriately compressed copy of 'mat', which + /// includes resizing *this. The details of how this is done will be + /// different in different child classes. 
+ virtual void CopyFromMat(CuMatrixBase &mat) = 0; + + /// Copies the contents of *this to 'mat', which should be + /// correctly sized beforehand. + virtual void CopyToMat(CuMatrixBase *mat) = 0; + + + // The number of rows in *this. + virtual int32 NumRows() = 0; + + // The number of columns in *this. + virtual int32 NumCols() = 0; + + ~CuCompressedMatrixBase() { } +}; + + + +/** + Class CuCompressedMatrix, templated on an integer type (expected to be one + of: int8, uint8, int16, uint16), this provides a way to approximate a + CuMatrix in a more memory-efficient format. It's used in nnet3 to + reduce memory use for large networks. + + It is *not* a CUDA equivalent for class CompressedMatrix (of + ../matrix/compressed-matrix.h). + */ +template +class CuCompressedMatrix: public CuCompressedMatrixBase { + public: + + + /// Constructor which sets 'scale_' according to + /// scale_ = range / std::numeric_limits::max(). + /// + /// range = 0 (only supported for I == int8) is a special case in which only + /// the sign of the input is retained; and when we reconstruct, the output + /// will be -1, 0 or 1. + CuCompressedMatrix(BaseFloat range); + + virtual void CopyFromMat(CuMatrixBase &mat); + + virtual void CopyToMat(CuMatrixBase *mat); + + virtual MatrixIndexT NumRows() { return num_rows_; } + + virtual MatrixIndexT NumCols() { return num_cols_; } + + + ~CuCompressedMatrix(); + + private: + + // The raw data. + I *data_; + + // Scale() affects how the raw data is interpreted as a floating point value. + // When uncompressing to a CuMatrix, we'll do + // f = scale_ * i + // where f is the floating point value we're writing to, and i is the integer + // value. + // + // scale_ = 0 is treated specially; in this case we just take notice of the + // sign of the input, and when uncompressing we do it with a scale such + // that the output becomes -1, 0 and 1. 
+ BaseFloat scale_; + + MatrixIndexT num_rows_; + MatrixIndexT num_cols_; + // stride_ is currently always equal to num_cols_; it was added mainly to + // point the way to possible future extension. + MatrixIndexT stride_; +}; + + + +// This enum value is used to encode the type you want to instantiate +// a CuCompressedMatrix with. It's used in class NnetComputation +// (cast to int32) as one of the arguments of kCompressMatrix. +enum CuCompressedMatrixType { + kCompressedMatrixInt8 = 1, + kCompressedMatrixUint8 = 2, + kCompressedMatrixInt16 = 3, + kCompressedMatrixUint16 = 4 +}; + +/** + This function allocates a new CuCompressedMatrix with type determined + by t, and with the 'range' parameter provided (range must be >= 0, + 0 as a special case). + It will crash at runtime if called when CUDA is not compiled in, or not + enabled. + */ +CuCompressedMatrixBase *NewCuCompressedMatrix(CuCompressedMatrixType t, + BaseFloat range); + + + + + + + +#endif diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index 140a6f9140c..a2517989294 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -367,6 +367,14 @@ void ComputeCommandAttributes( vars.RecordAccessForSubmatrix(c.arg2, kReadAccess, &attr); break; } + case kCompressMatrix: { + vars.RecordAccessForSubmatrix(c.arg1, kReadWriteAccess, &attr); + break; + } + case kUncompressMatrix: { + vars.RecordAccessForSubmatrix(c.arg1, kWriteAccess, &attr); + break; + } case kAcceptInput: { vars.RecordAccessForSubmatrix(c.arg1, kWriteAccess, &attr); break; } @@ -555,6 +563,7 @@ void ComputationChecker::Check() { CheckComputationIndexes(); a_.Init(nnet_, computation_); CheckComputationMatrixAccesses(); + CheckComputationCompression(); CheckComputationUndefined(); CheckComputationDebugInfo(); if (config_.check_rewrite) @@ -679,6 +688,63 @@ void ComputationChecker::CheckComputationMatrixAccesses() const { } } +void ComputationChecker::CheckComputationCompression() const { + int32 num_matrices = 
a_.matrix_accesses.size(); + + // 'middle_command' will be the index of the command that separates + // the forward and backward passes. + int32 middle_command = -1; + for (size_t i = 0; i < computation->commands.size(); i++) { + if (computation->commands[i].command_type == kNoOperationMarker) { + middle_command = static_cast(i); + break; + } + } + for (int32 matrix_index = 1; matrix_index < num_matrices; matrix_index++) { + const MatrixAccesses &accesses = a_.matrix_accesses[matrix_index]; + int32 num_accesses = accesses.accesses.size(); + for (int32 a = 0; a < num_accesses; a++) { + const Access &access = accesses.accesses[a]; + int32 command_index = accesses.command_inex; + const NnetComputation::Command &command = + computation_.commands[command_index]; + if (command.command_type == kUncompressMatrix) { + // check that the previous access to this matrix was a compression + // command. + KALDI_ASSERT( + a > 0 && computation_.commands[ + accesses.accesses[a-1].command_index].command_type == + kCompressMatrix); + + if (command.command_type == kCompressMatrix) { + // check that the next access to this matrix is an uncompression + // command. + int32 next_command_index = accesses.accesses[a+1].command_index; + KALDI_ASSERT(computation_.commands[next_command_index].command_type == + kUncompressMatrix && + command_index < middle_command && + next_command_index > middle_command); + if (command.alpha == 0.0) { + // alpha == 0.0 means we're only retaining the sign; we should + // only do this if this is the output of a ReLU. + // make sure there are only 2 commands after this: the uncompress + // command, a relu backprop command, and a deallocation command. + KALDI_ASSERT(a > 0 && command.arg2 == kCompressedMatrixUint8 && + num_accesses <= a + 4); + // make sure the previous access to that matrix was a ReLU + // propagation. 
+ int32 previous_command_index = accesses.accesses[a-1].command_index; + const NnetComputation::Command &previous_command = + computation_.commands[previous_command_index]; + KALDI_ASSERT(previous_command.command_type == kPropagate && + nnet_.GetComponent(previous_command.arg1).Type() == + "RectifiedLinearComponent"); + } + } + } + } +} + /** This very basic check just makes sure that all indexes in the commands are within range, that dimensions agree with the request, that row/column dimensions @@ -931,6 +997,22 @@ void ComputationChecker::CheckComputationIndexes() const { } break; } + case kCompressMatrix: { + if (c.arg1 < 1 || c.arg1 >= num_submatrices || + !computation_.IsWholeMatrix(c.arg1)) + KALDI_ERR << "submatrix index out of range or invalid"; + if (c.arg2 < static_cast(kCompressedMatrixInt8) || + c.arg2 > static_cast(kCompressedMatrixUint16)) + KALDI_ERR << "Invalid compressed-matrix type."; + if (c.alpha < 0.0 || c.alpha > 1000.0 || + (c.alpha == 0.0 && c.arg1 != kCompressedMatrixInt8)) + KALDI_ERR << "Invalid alpha in kCompressMatrix command."; + } + case kUncompressMatrix: { + if (c.arg1 < 1 || c.arg1 >= num_submatrices || + !computation_.IsWholeMatrix(c.arg1)) + KALDI_ERR << "submatrix index out of range or invalid"; + } case kAcceptInput: case kProvideOutput: { if (c.arg1 < 1 || c.arg1 >= num_submatrices || !computation_.IsWholeMatrix(c.arg1)) @@ -1319,13 +1401,22 @@ int64 GetMaxMemoryUse(const NnetComputation &computation) { num_submatrices = computation.submatrices.size(); for (int32 command_index = 0; command_index < num_commands; ++command_index) { const NnetComputation::Command &c = computation.commands[command_index]; - int64 this_num_bytes = -100000000; + int64 this_num_bytes = -100000000, + this_compressed_num_bytes = -10000000; if (c.arg1 >= 0 && c.arg1 < num_submatrices) { // if arg1 could plausibly be a sub-matrix index... 
const NnetComputation::SubMatrixInfo &submat_info = computation.submatrices[c.arg1]; this_num_bytes = static_cast(sizeof(BaseFloat)) * submat_info.num_rows * submat_info.num_cols; + + if (c.arg2 >= static_cast(kCompressedMatrixInt8) && + c.arg2 <= static_cast(kCompressedMatrixUint16)) { + this_compressed_num_bytes = + ((c.arg2 == static_cast(kCompressedMatrixInt8) || + c.arg2 == static_cast(kCompressedMatrixUint8)) ? + 1 : 2) * submat_info.num_rows * submat_info.num_cols; + } } switch (c.command_type) { case kAllocMatrix: @@ -1335,6 +1426,12 @@ int64 GetMaxMemoryUse(const NnetComputation &computation) { case kDeallocMatrix: cur_memory_use -= this_num_bytes; break; + case kCompressMatrix: + cur_memory_use += this_compressed_num_bytes - this_num_bytes; + break; + case kUncompressMatrix: + cur_memory_use += this_num_bytes - this_compressed_num_bytes; + break; default: break; } diff --git a/src/nnet3/nnet-analyze.h b/src/nnet3/nnet-analyze.h index a82cd4cb5b1..2966cf947e4 100644 --- a/src/nnet3/nnet-analyze.h +++ b/src/nnet3/nnet-analyze.h @@ -414,15 +414,17 @@ class ComputationChecker { const NnetComputation &computation); void Check(); // call this only once. private: - // various dimension consistency checks and checks on properties. + // Various dimension consistency checks and checks on properties. void CheckComputationIndexes() const; - // checks for a situation where an undefined variable is read. + // Checks for a situation where an undefined variable is read. void CheckComputationUndefined() const; - // checks that all writes are done before reads. details with implementation. + // Checks that all writes are done before reads. details with implementation. void CheckComputationRewrite() const; - // check matrix accesses make sense. + // Check matrix accesses make sense. void CheckComputationMatrixAccesses() const; - // check debug_info has the correct size, if used. + // Some checks related to the kCompressMatrix and kUncompressMatrix commands. 
+ void CheckComputationCompression() const; + // Check debug_info has the correct size, if used. void CheckComputationDebugInfo() const; const CheckComputationOptions &config_; diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index 77facbdba79..405faa56ede 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -282,6 +282,10 @@ void NnetComputation::Command::Read(std::istream &is, bool binary) { command_type = kAddToRowsMulti; } else if (command_type_str == "kAddRowRanges") { command_type = kAddRowRanges; + } else if (command_type_str == "kCompressMatrix") { + command_type = kCompressMatrix; + } else if (command_type_str == "kUncompressMatrix") { + command_type = kUncompressMatrix; } else if (command_type_str == "kAcceptInput") { command_type = kAcceptInput; } else if (command_type_str == "kProvideOutput") { @@ -375,6 +379,12 @@ void NnetComputation::Command::Write(std::ostream &os, bool binary) const { case kAddRowRanges: os << "kAddRowRanges\n"; break; + case kCompressMatrix: + os << "kCompressMatrix\n"; + break; + case kUncompressMatrix: + os << "kUncompressMatrix\n"; + break; case kAcceptInput: os << "kAcceptInput\n"; break; @@ -689,7 +699,7 @@ void NnetComputation::Print(std::ostream &os, const Nnet &nnet) const { } void NnetComputation::Read(std::istream &is, bool binary) { - int32 version = 4, // must be in sync with 'version' in Write. + int32 version = 5, // must be in sync with 'version' in Write. version_in = 1; // defaults to 1 if no version specified. ExpectToken(is, binary, ""); @@ -823,7 +833,7 @@ void NnetComputation::Read(std::istream &is, bool binary) { } void NnetComputation::Write(std::ostream &os, bool binary) const { - int32 version = 4; // Must be in sync with version in Read. + int32 version = 5; // Must be in sync with version in Read. 
WriteToken(os, binary, ""); WriteToken(os, binary, ""); WriteBasicType(os, binary, version); diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index 4b1386a1f01..01c51e8e822 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -232,6 +232,13 @@ struct ComputationRequest { indexes_ranges[arg3]. We use the "alpha" as if AddRowRanges() accepted that argument, even though it doesn't (we fake it using other calls, if alpha != 1.0). + - kCompressMatrix: Compresses the matrix which should be referred to + by submatrix-index arg1. arg2 is a number that determines the + compression type (it's converted from the enum CuCompressed + MatrixType; 1=int8, 2=uint8, 3=int16, 4=uint16), and alpha + determines the 'range' parameter (c.f. NewCuCompressedMatrix()). + - kUncompressMatrix: Uncompresses the matrix which is referred to + by submatrix-index arg1 (it should previously have been compressed). - kAcceptInput: accepts a matrix of input from the user, which may be either features, or derivatives w.r.t. the output. arg1 is the submatrix index of a whole matrix that the input goes to, and arg2 is the index of the network @@ -263,7 +270,8 @@ enum CommandType { kPropagate, kBackprop, kBackpropNoModelUpdate, kMatrixCopy, kMatrixAdd, kCopyRows, kAddRows, kCopyRowsMulti, kCopyToRowsMulti, kAddRowsMulti, kAddToRowsMulti, - kAddRowRanges, kAcceptInput, kProvideOutput, + kAddRowRanges, kCompressMatrix, kUncompressMatrix, + kAcceptInput, kProvideOutput, kNoOperation, kNoOperationPermanent, kNoOperationMarker, kNoOperationLabel, kGotoLabel }; diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 23286211301..ad63043d851 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -382,6 +382,39 @@ void NnetComputer::ExecuteCommand() { } break; } + case kCompressMatrix: { + // This does nothing if CUDA is not in use. 
+#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + if (compressed_matrices_.empty()) + compressed_matrices_.resize(matrices_.size(), NULL); + int32 m = computation_.submatrices[c.arg1].matrix_index; + KALDI_ASSERT(compressed_matrices_[m] == NULL && + matrices_[m].NumRows() != 0); + compressed_matrices_[m] = NewCuCompressedMatrix( + static_cast(c.arg2), c.alpha); + compressed_matrices_[m]->CopyFromMat(matrices_[m]); + matrices_[m].Resize(0, 0); + } +#endif + } + case kUncompressMatrix: { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + int32 m = computation_.submatrices[c.arg1].matrix_index; + CuCompressedMatrixBase *compressed_matrix = + compressed_matrices_[m]; + KALDI_ASSERT(compressed_matrix != NULL && + matrices_[m].NumRows() == 0); + matrices_[m].Resize(compressed_matrix->NumRows(), + compressed_matrix->NumCols(), + kUndefined); + compressed_matrix->CopyToMat(&(matrices_[m])); + delete compressed_matrix; + compressed_matrices_[m] = NULL; + } +#endif + } case kNoOperation: case kNoOperationPermanent: case kNoOperationMarker: case kNoOperationLabel: break; diff --git a/src/nnet3/nnet-compute.h b/src/nnet3/nnet-compute.h index 869dd107bf6..19af856bad8 100644 --- a/src/nnet3/nnet-compute.h +++ b/src/nnet3/nnet-compute.h @@ -163,6 +163,14 @@ class NnetComputer { // NULL). std::vector memos_; + // This is only used when commands kCompressMatrix and kUncompressMatrix are + // invoked. It will be (the first time we compress a matrix) resized to be + // the same size as 'matrices_' (i.e., indexed by matrix index). When we + // compress a matrix m we set compressed_matrices_[m] to a non-NULL value and + // resize matrices_[m] to empty; and when we uncompress it, the reverse + // happens. + std::vector compressed_matrices_; + // executes the command in computation_.commands[program_counter_]. 
void ExecuteCommand(); diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 2a0b2dcd499..0d64165efe4 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -4042,5 +4042,310 @@ void RemoveCommandsForUnusedMatrix(const Analyzer &analyzer, } } + + +// This comparison operator is used in the function InsertCommands() +// to sort a list of these pairs by the .first element. +struct CommandPairComparator { + // operator () should be viewed as a '<' operator that only looks at + // the .first element, treating the .second elements as equal. + bool operator () (const std::pair &p1, + const std::pair &p2) const { + return p1.first < p2.first; + } +}; + +void InsertCommands( + std::vector > *new_commands, + NnetComputation *computation) { + int32 num_new_commands = new_commands->size(), + num_old_commands = computation->commands.size(); + if (num_new_commands == 0) + return; + CommandPairOperator comparison_operator; + // use std::stable_sort so that for entries in 'new_commands' that + // have the same .first value, they stay in the same order they were + // in before sorting. 
+ std::stable_sort(new_commands->begin(), new_commands->end(), + comparison_operator); + + if (RandInt(0, 3) == 0) { // check 'new_commands' + for (int32 i = 0; i + 1 < num_new_commands; i++) { + KALDI_ASSERT((*new_commands)[i].first <= (*new_commands)[i+1].first && + (*new_commands)[i].first >= 0 && + (*new_commands)[i+1].first <= num_old_commands); + } + } + std::vector merged_commands; + merged_commands.reserve(num_old_commands + num_new_commands); + + std::vector >::const_iterator + new_commands_iter = new_commands->begin(), + new_commands_end = new_commands->end(); + + for (int32 old_command_index = 0; old_command_index <= num_old_commands; + old_command_index++) { + while (new_commands_iter != new_commands_end && + new_commands_iter->first <= old_command_index) { + merged_commands.push_back(new_commands_iter->second); + ++new_commands_iter; + } + if (old_command_index < num_old_commands) + merged_commands.push_back(computation->commands[old_command_index]); + } + KALDI_ASSERT(merged_commands.size() == num_old_commands + + num_new_commands); + // copy to 'computation->commands' via shallow swap. + computation->commands.swap(merged_commands); + FixGotoLabel(computation); +} + +/** + This class is used in the function OptimizeMemoryCompression(), + once we determine that there is some potential to do memory compression + for this computation. + */ +class MemoryCompressionOptimizer { + public: + + /** @param [in] nnet The neural net the computation is for. + @param [in] memory_compression_level. The level of compression: + 0 = no compression (the constructor should not be calle with this value). + 1 = compression that doesn't affect the results (but still takes time). + 2 = compression that affects the results only very slightly + 3 = compression that affects the results a little more. + @param [in] middle_command Must be the command-index of the + command of type kNoOperationMarker in 'computation'. + @param [in,out] computation The computation we're optimizing. 
+ */ + MemoryCompressionOptimizer(const Nnet &nnet, + int32 memory_compression_level, + int32 middle_command, + NnetComputation *computation): + nnet_(nnet), memory_compression_level_(memory_compression_level), + middle_command_(middle_command), computation_(computation) { } + + void Optimize(); + private: + + // This function, called from Compress(), figures out whether we can compress + // matrix m, and if so, adds an entry to compress_info_. + void ProcessMatrix(int32 m); + + // This function modifies the commands in '*computation_', taking + // as input the commands in compress_info_. + void ModifyComputation(); + + // While deciding what matrices to compress we will create a list of structs + // of type MatrixCompressInfo. Later we copy-and-modify the commands in the + // computation, putting the compression commands into their appropriate place. + struct MatrixCompressInfo { + // m is the matrix-index of the matrix we're going to compress. + int32 m; + // compression_command_index is the command-index of the command + // *after* which we will place the compression command. Normally + // this will be some type of propagation. + int32 compression_command_index; + // compression_command_index is the command-index of the command + // *before* which we will place the uncompression command. Normally + // this will be some type of backprop. + int32 uncompression_command_index; + // 'compression_type' (e.g. kCompressedMatrixInt8) determines the type + // we compress the BaseFloats to. + CuCompressedMatrixType compression_type; + // 'range' determines range of values that the compressed values can + // be in: for signed types they are in [-range, range], for unsigned + // types, in [0, range]. + // As a special case, range = 0 means that the compression just stores the + // sign (-1, 0 or 1) of the input, and decompresses it to -1, 0 or 1; this + // is useful for ReLUs. 
+ BaseFloat range; + + MatrixCompressInfo(int32 m, int32 forward_command_index, + int32 backward_command_index, + CuCompressedMatrixType compression_type, + BaseFloat range): + m(m), compression_command_index(forward_command_index), + uncompression_command_index(backward_command_index), + compression_type(compression_type), range(range) { } + + }; + std::vector compress_info_; + + const Nnet &nnet_; + int32 memory_compression_level_; + NnetComputation *computation_; + Analyzer analyzer_; +}; + + +void MemoryCompressionOptimizer::ModifyComputation() { + int32 cur_num_commands = computation_->commands.size(); + + // whole_submatrices[m] is the submatrix-index of the submatrix that + // represents the whole of matrix m. + std::vector whole_submatrices; + computation_->GetWholeSubmatrices(&whole_submatrices); + + // 'pairs_to_insert' will be a list of pairs (command-index, command), + // meaning: (command-index just before which to insert this command; command + // to insert). + std::vector > + pairs_to_insert; + pairs_to_insert.reserve(compress_info_.size() * 2); + for (size_t i = 0; i < compress_info_.size(); i++) { + const MatrixCompressInfo &info = compress_info_[i]; + int32 s = whole_submatrices[info.m]; + // below we use compression_command_index + 1 because we want the + // compression to go after the command in 'info.compression_command_index' + // (which might be, for instance, a forward propagation command). + std::pair p1( + info.compression_command_index + 1, + NnetComputation::Command(info.range, kCompressMatrix, + s, static_cast(info.compression_type))); + pairs_to_insert.push_back(p1); + std::pair p2( + info.uncompression_command_index, + NnetComputation::Command(1.0, kUncompressMatrix, s)); + pairs_to_insert.push_back(p2); + } + InsertCommands(&pairs_to_insert, + computation_); +} + + +void MemoryCompressionOptimizer::Optimize() { + analyzer_.Init(nnet_, *computation_); + // note: matrix zero is not really a matrix. 
+ int32 num_matrices = computation_->matrices.size(); + for (int32 m = 1; m < num_matrices; m++) + ProcessMatrix(m); + if (!compress_info_.empty()) + ModifyComputation(); +} + +void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { + // 'accesses' list the commands that access this matrix. + const std::vector &accesses = analyzer_.matrix_accesses[m].accesses; + Access middle_access; + middle_access.command_index = middle_command_; + std::vector::const_iterator iter = std::lower_bound(accesses.begin(), + accesses.end(), + middle_access); + // At this point, 'iter' points to the first access in 'accesses' + // whose command index is >= 'middle_command_' (which separates the forward + // and backward passes), or accesses.end() if this matrix was not + // accessed during the backward pass. + if (iter == accesses.end()) { + return; // There is nothing to do: this matrix was not accessed during the + // backward pass. + } + if (iter == accesses.begin()) { + return; // There is nothing to do: this matrix was not accessed during the + // forward pass. + } + // 'backward_access' is the first access of the matrix in the backward + // pass of the computation, and + // 'forward_access' is the last access of the matrix in the forward pass + // of the computation. + const Access &backward_access = iter[0], + &forward_access = iter[-1]; + KALDI_ASSERT(forward_access.command_index < middle_command_ && + backward_access.command_index > middle_command_); + // 'backward_access_is_last_access' is going to be set to true if + // 'backward_access' is the last command to access the matrix (apart from + // deallocation commands). + bool backward_access_is_last_access = false; + if (accesses.end() - iter == 2) { + // if there is at most 1 command after 'backward_access'... 
+ const Access &next_access = iter[1]; + NnetComputation::Command &next_command = + computation_->commands[next_access.command_index]; + if (next_command.command_type == kDeallocMatrix || + next_command.command_type == kSwapMatrix) + backward_access_is_last_access = true; + } + int32 backward_command_index = backward_access.command_index, + forward_command_index = forward_access.command_index; + NnetComputation::Command + &forward_command = computation_->commands[forward_command_index], + &backward_command = computation_->commands[backward_command_index]; + + if (memory_compression_level_ >= 1 && + backward_access_is_last_access && + forward_access.access_type == kWriteAccess && + backward_access.access_type == kReadAccess && + forward_command.command_type == kPropagate && + backward_command.command_type == kBackprop) { + int32 component_index = backward_access.arg1; + const Component *component = nnet_.GetComponent(component_index); + // this is potentially a candidate for our optimization for ReLU units, + // where we only store the sign. + if (component->Type() == "RectifiedLinearComponent" && + component_index == forward_access.arg1) { + compress_info_.push_back( + MatrixCompressInfo(m, forward_command_index, + backward_command_index, + kCompressedMatrixUint8, 0.0)); + return; + } + } + + // TODO: we can later implement compression for other cases. + // + +} + + + + +void OptimizeMemoryCompression(const Nnet &nnet, + int32 memory_compression_level, + NnetComputation *computation) { + if (memory_compression_level == 0 || computation->commands.empty()) + return; + // don't apply this optimization to looped computations. + if (computation->commands.back().command_type == kGotoLabel) + return; + + // 'middle_command' will be the index of the command of type + // 'kNoOperationMarker' that separates the forward and backward + // passes. 
If it doesn't exist, it means this computation doesn't + include + int32 middle_command = -1; + for (size_t i = 0; i < computation->commands.size(); i++) { + if (computation->commands[i].command_type == kNoOperationMarker) { + if (middle_command < 0) { + middle_command = static_cast(i); + } else { + KALDI_WARN << "Found more than one command of type kNoOperationMarker " + "in non-looped computation."; + // there is more than one command of this type... this wasn't expected. + return; + } + } + } + if (middle_command == -1) { + return; // This computation doesn't have a backprop pass. + } + if (memory_compression_level >= 1) { + int64 bytes_used_initial, bytes_used_final; + if (GetVerboseLevel() >= 2) + bytes_used_initial = GetMaxMemoryUse(*computation); + + MemoryCompressionOptimizer opt(nnet, memory_compression_level, + middle_command, computation); + opt.Optimize(); + + if (GetVerboseLevel() >= 2) { + bytes_used_final = GetMaxMemoryUse(*computation); + KALDI_VLOG(2) << "Memory compression reduced memory use from " + << bytes_used_initial << " to " + << bytes_used_final << " bytes."; + } + } +} + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 98615e2e146..1ffcc330adf 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -524,6 +524,46 @@ void IdentifyIndexesArgs(std::vector *commands, void IdentifyIndexesRangesArgs(std::vector *commands, std::vector *indexes_ranges_args); +/// Inserts commands into the computation at the requested places. 'commands' +/// is a list of pairs (command-index, command) that is expected to be sorted +/// on command-index. For each entry (c, command) in 'commands', 'command' is +/// inserted into 'computation' just *before* the command that (at entry) is in +/// computation->commands[c]. 
If there are multiple pairs with the same index +/// c, they will remain in the same order in which they were present in +/// 'commands'; however, 'commands' does not have to be sorted on 'c'. +/// As a special case, if c == computation->commands.size(), the +/// corresponding commands are inserted at the end of the computation. +/// This function will appropriately renumber the argument of the kGotoLabel +/// command of any 'looped' computation. Command indexes c in commands[*].first +/// must be in the range [0, computation->commands.size()]. +/// This function may modify 'commands' by sorting it. +void InsertCommands( + std::vector > *commands, + NnetComputation *computation); + +/// Performs optimization to reduce memory usage where possible, +/// making use of the kCompressMatrix and kUncompressMatrix commands. +/// Should only be done after most other optimizations, because some +/// optimizations (such as variable-merging) would not work correctly +/// after doing this optimization. This does nothing for looped +/// computations. It's OK, though, to expand a shortcut computation +/// (i.e. call ExpandComputation) after doing this. +/// +/// memory_compression_level determines how aggressive the compression +/// is. Allowed values: +/// 0 = no compression at all +/// 1 = compression that doesn't affect results (e.g. compress +/// ReLU outputs to 1 byte, as just the sign is needed). +/// 2 = compression that may affect the results slightly (e.g. 16-bit +/// compression of the output of NormalizeComponent and the like), +/// but this is not implemented yet, so equivalent to 1. +/// 3 = compression that may affect the results more than just +/// slightly. Not implemented yet, so equivalent to 1. +void OptimizeMemoryCompression(const Nnet &nnet, + int32 memory_compression_level, + NnetComputation *computation); + + /// This function tries to optimize computation 'computation' for an 'looped' /// computation. 
It expects as input a computation with no backprop but with /// multiple 'segments' separated by command kNoOperationLabel, where each diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 0eb5de2c4fc..3dff8c0a4f3 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -554,7 +554,7 @@ void Optimize(const NnetOptimizeOptions &config, // the looped computation optimization has to go before // 'RemoveUnnecessaryAllocation()'. We don't gate this by 'config.optimize' // because it's necessary for looped computation to run. - if (config.optimize_looped_computation){ + if (config.optimize_looped_computation) { OptimizeLoopedComputation(nnet, computation); if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); @@ -579,6 +579,15 @@ void Optimize(const NnetOptimizeOptions &config, if (config.optimize_looped_computation) FixGotoLabel(computation); + + if (config.memory_compression_level > 0 && + !config.optimize_looped_computation) { + OptimizeMemoryCompression(nnet, config.memory_compression_level, + computation); + if (GetVerboseLevel() >= 3) + CheckComputation(nnet, *computation, false); + } + if (GetVerboseLevel() >= 3) { CheckComputation(nnet, *computation, false); KALDI_LOG << "After optimization, max memory use (bytes) = " diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index 4ffa4de449e..d3ecf01847a 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -32,6 +32,7 @@ namespace nnet3 { // Options class for optimizing a NnetComputation. The main projected use for // this is in debugging the optimization code itself, so that if an error is // detected, we can work out which optimization was responsible for the error. +// See the Register() function below for option-specific documentation. struct NnetOptimizeOptions { bool optimize; // setting this false disallow all optimization. 
bool consolidate_model_update; @@ -49,6 +50,7 @@ struct NnetOptimizeOptions { int32 max_deriv_time; int32 max_deriv_time_relative; bool snip_row_ops; + int32 memory_compression_level; // optimize_looped_computation is a 'hidden config' not available from // the command line; it's set to true to enable the optimization for // looped computation that turns a linear computation into a loop. @@ -71,6 +73,7 @@ struct NnetOptimizeOptions { max_deriv_time(std::numeric_limits::max()), max_deriv_time_relative(std::numeric_limits::max()), snip_row_ops(true), + memory_compression_level(0), optimize_looped_computation(false) { } void Register(OptionsItf *opts) { @@ -123,6 +126,16 @@ struct NnetOptimizeOptions { opts->Register("snip-row-ops", &snip_row_ops, "Set this to false to " "disable an optimization that reduces the size of certain " "per-row operations"); + opts->Register("snip-row-ops", &snip_row_ops, "Set this to false to " + "disable an optimization that reduces the size of certain " + "per-row operations"); + opts->Register("memory-compression-level", &memory_compression_level, + "This is only relevant to training, not decoding. Set this " + "to 0,1,2,3; higher levels are more aggressive at reducing " + "memory by compressing quantities needed for backprop, " + "potentially at the expense of speed and the accuracy " + "of derivatives. 
0 means no compression at all."); + } void Read(std::istream &is, bool binary); void Write(std::ostream &os, bool binary) const; From 3031e26e6744f8454c268696829285cc3ff6bae1 Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Thu, 25 Jan 2018 18:27:28 -0500 Subject: [PATCH 089/184] [src] Optimization to decoders for speed (#2168) --- src/decoder/lattice-faster-decoder.cc | 62 +++++++++++++---- src/decoder/lattice-faster-decoder.h | 11 ++- src/decoder/lattice-faster-online-decoder.cc | 72 ++++++++++++++++---- src/decoder/lattice-faster-online-decoder.h | 13 +++- 4 files changed, 126 insertions(+), 32 deletions(-) diff --git a/src/decoder/lattice-faster-decoder.cc b/src/decoder/lattice-faster-decoder.cc index c5c9aae743c..963430a63f1 100644 --- a/src/decoder/lattice-faster-decoder.cc +++ b/src/decoder/lattice-faster-decoder.cc @@ -3,6 +3,7 @@ // Copyright 2009-2012 Microsoft Corporation Mirko Hannemann // 2013-2014 Johns Hopkins University (Author: Daniel Povey) // 2014 Guoguo Chen +// 2018 Zhehuai Chen // See ../../COPYING for clarification regarding multiple authors // @@ -68,7 +69,7 @@ void LatticeFasterDecoder::InitDecoding() { active_toks_[0].toks = start_tok; toks_.Insert(start_state, start_tok); num_toks_++; - ProcessNonemitting(config_.beam); + ProcessNonemittingWrapper(config_.beam); } // Returns true if any kind of traceback is available (not necessarily from @@ -84,8 +85,8 @@ bool LatticeFasterDecoder::Decode(DecodableInterface *decodable) { while (!decodable->IsLastFrame(NumFramesDecoded() - 1)) { if (NumFramesDecoded() % config_.prune_interval == 0) PruneActiveTokens(config_.lattice_beam * config_.prune_scale); - BaseFloat cost_cutoff = ProcessEmitting(decodable); - ProcessNonemitting(cost_cutoff); + BaseFloat cost_cutoff = ProcessEmittingWrapper(decodable); + ProcessNonemittingWrapper(cost_cutoff); } FinalizeDecoding(); @@ -588,8 +589,8 @@ void LatticeFasterDecoder::AdvanceDecoding(DecodableInterface *decodable, if (NumFramesDecoded() % 
config_.prune_interval == 0) { PruneActiveTokens(config_.lattice_beam * config_.prune_scale); } - BaseFloat cost_cutoff = ProcessEmitting(decodable); - ProcessNonemitting(cost_cutoff); + BaseFloat cost_cutoff = ProcessEmittingWrapper(decodable); + ProcessNonemittingWrapper(cost_cutoff); } } @@ -683,6 +684,7 @@ BaseFloat LatticeFasterDecoder::GetCutoff(Elem *list_head, size_t *tok_count, } } +template BaseFloat LatticeFasterDecoder::ProcessEmitting(DecodableInterface *decodable) { KALDI_ASSERT(active_toks_.size() > 0); int32 frame = active_toks_.size() - 1; // frame is the frame-index @@ -707,6 +709,7 @@ BaseFloat LatticeFasterDecoder::ProcessEmitting(DecodableInterface *decodable) { BaseFloat cost_offset = 0.0; // Used to keep probabilities in a good // dynamic range. + const FstType &fst = dynamic_cast(fst_); // First process the best token to get a hopefully // reasonably tight bound on the next cutoff. The only @@ -715,15 +718,13 @@ BaseFloat LatticeFasterDecoder::ProcessEmitting(DecodableInterface *decodable) { StateId state = best_elem->key; Token *tok = best_elem->val; cost_offset = - tok->tot_cost; - for (fst::ArcIterator > aiter(fst_, state); + for (fst::ArcIterator aiter(fst, state); !aiter.Done(); aiter.Next()) { - Arc arc = aiter.Value(); + const Arc &arc = aiter.Value(); if (arc.ilabel != 0) { // propagate.. 
- arc.weight = Times(arc.weight, - Weight(cost_offset - - decodable->LogLikelihood(frame, arc.ilabel))); - BaseFloat new_weight = arc.weight.Value() + tok->tot_cost; + BaseFloat new_weight = arc.weight.Value() + cost_offset - + decodable->LogLikelihood(frame, arc.ilabel) + tok->tot_cost; if (new_weight + adaptive_beam < next_cutoff) next_cutoff = new_weight + adaptive_beam; } @@ -744,7 +745,7 @@ BaseFloat LatticeFasterDecoder::ProcessEmitting(DecodableInterface *decodable) { StateId state = e->key; Token *tok = e->val; if (tok->tot_cost <= cur_cutoff) { - for (fst::ArcIterator > aiter(fst_, state); + for (fst::ArcIterator aiter(fst, state); !aiter.Done(); aiter.Next()) { const Arc &arc = aiter.Value(); @@ -775,12 +776,31 @@ BaseFloat LatticeFasterDecoder::ProcessEmitting(DecodableInterface *decodable) { return next_cutoff; } +template BaseFloat LatticeFasterDecoder::ProcessEmitting>( + DecodableInterface *decodable); +template BaseFloat LatticeFasterDecoder::ProcessEmitting>( + DecodableInterface *decodable); +template BaseFloat LatticeFasterDecoder::ProcessEmitting>( + DecodableInterface *decodable); + +BaseFloat LatticeFasterDecoder::ProcessEmittingWrapper(DecodableInterface *decodable) { + if (fst_.Type() == "const") { + return LatticeFasterDecoder::ProcessEmitting>(decodable); + } else if (fst_.Type() == "vector") { + return LatticeFasterDecoder::ProcessEmitting>(decodable); + } else { + return LatticeFasterDecoder::ProcessEmitting>(decodable); + } +} + +template void LatticeFasterDecoder::ProcessNonemitting(BaseFloat cutoff) { KALDI_ASSERT(!active_toks_.empty()); int32 frame = static_cast(active_toks_.size()) - 2; // Note: "frame" is the time-index we just processed, or -1 if // we are processing the nonemitting transitions before the // first frame (called from InitDecoding()). + const FstType &fst = dynamic_cast(fst_); // Processes nonemitting arcs for one frame. Propagates within toks_. 
// Note-- this queue structure is is not very optimal as @@ -812,7 +832,7 @@ void LatticeFasterDecoder::ProcessNonemitting(BaseFloat cutoff) { // but since most states are emitting it's not a huge issue. tok->DeleteForwardLinks(); // necessary when re-visiting tok->links = NULL; - for (fst::ArcIterator > aiter(fst_, state); + for (fst::ArcIterator aiter(fst, state); !aiter.Done(); aiter.Next()) { const Arc &arc = aiter.Value(); @@ -837,6 +857,22 @@ void LatticeFasterDecoder::ProcessNonemitting(BaseFloat cutoff) { } // while queue not empty } +template void LatticeFasterDecoder::ProcessNonemitting>( + BaseFloat cutoff); +template void LatticeFasterDecoder::ProcessNonemitting>( + BaseFloat cutoff); +template void LatticeFasterDecoder::ProcessNonemitting>( + BaseFloat cutoff); + +void LatticeFasterDecoder::ProcessNonemittingWrapper(BaseFloat cost_cutoff) { + if (fst_.Type() == "const") { + return LatticeFasterDecoder::ProcessNonemitting>(cost_cutoff); + } else if (fst_.Type() == "vector") { + return LatticeFasterDecoder::ProcessNonemitting>(cost_cutoff); + } else { + return LatticeFasterDecoder::ProcessNonemitting>(cost_cutoff); + } +} void LatticeFasterDecoder::DeleteElems(Elem *list) { for (Elem *e = list, *e_tail; e != NULL; e = e_tail) { diff --git a/src/decoder/lattice-faster-decoder.h b/src/decoder/lattice-faster-decoder.h index fd1b2779fe1..56e4af1b95b 100644 --- a/src/decoder/lattice-faster-decoder.h +++ b/src/decoder/lattice-faster-decoder.h @@ -3,6 +3,7 @@ // Copyright 2009-2013 Microsoft Corporation; Mirko Hannemann; // 2013-2014 Johns Hopkins University (Author: Daniel Povey) // 2014 Guoguo Chen +// 2018 Zhehuai Chen // See ../../COPYING for clarification regarding multiple authors // @@ -339,12 +340,18 @@ class LatticeFasterDecoder { /// Processes emitting arcs for one frame. Propagates from prev_toks_ to cur_toks_. /// Returns the cost cutoff for subsequent ProcessNonemitting() to use. 
- BaseFloat ProcessEmitting(DecodableInterface *decodable); + /// Templated on FST type for speed; called via ProcessEmittingWrapper(). + template BaseFloat ProcessEmitting(DecodableInterface *decodable); + + BaseFloat ProcessEmittingWrapper(DecodableInterface *decodable); /// Processes nonemitting (epsilon) arcs for one frame. Called after /// ProcessEmitting() on each frame. The cost cutoff is computed by the /// preceding ProcessEmitting(). - void ProcessNonemitting(BaseFloat cost_cutoff); + /// the templated design is similar to ProcessEmitting() + template void ProcessNonemitting(BaseFloat cost_cutoff); + + void ProcessNonemittingWrapper(BaseFloat cost_cutoff); // HashList defined in ../util/hash-list.h. It actually allows us to maintain // more than one list (e.g. for current and previous frames), but only one of diff --git a/src/decoder/lattice-faster-online-decoder.cc b/src/decoder/lattice-faster-online-decoder.cc index cd7b564b721..5fb2ef25a3d 100644 --- a/src/decoder/lattice-faster-online-decoder.cc +++ b/src/decoder/lattice-faster-online-decoder.cc @@ -4,6 +4,7 @@ // 2013-2014 Johns Hopkins University (Author: Daniel Povey) // 2014 Guoguo Chen // 2014 IMSL, PKU-HKUST (author: Wei Shi) +// 2018 Zhehuai Chen // See ../../COPYING for clarification regarding multiple authors // @@ -68,7 +69,7 @@ void LatticeFasterOnlineDecoder::InitDecoding() { active_toks_[0].toks = start_tok; toks_.Insert(start_state, start_tok); num_toks_++; - ProcessNonemitting(config_.beam); + ProcessNonemittingWrapper(config_.beam); } // Returns true if any kind of traceback is available (not necessarily from @@ -84,8 +85,8 @@ bool LatticeFasterOnlineDecoder::Decode(DecodableInterface *decodable) { while (!decodable->IsLastFrame(NumFramesDecoded() - 1)) { if (NumFramesDecoded() % config_.prune_interval == 0) PruneActiveTokens(config_.lattice_beam * config_.prune_scale); - BaseFloat cost_cutoff = ProcessEmitting(decodable); // Note: the value returned by - 
ProcessNonemitting(cost_cutoff); + BaseFloat cost_cutoff = ProcessEmittingWrapper(decodable); // Note: the value returned by + ProcessNonemittingWrapper(cost_cutoff); } FinalizeDecoding(); @@ -763,8 +764,8 @@ void LatticeFasterOnlineDecoder::AdvanceDecoding(DecodableInterface *decodable, PruneActiveTokens(config_.lattice_beam * config_.prune_scale); } // note: ProcessEmitting() increments NumFramesDecoded(). - BaseFloat cost_cutoff = ProcessEmitting(decodable); - ProcessNonemitting(cost_cutoff); + BaseFloat cost_cutoff = ProcessEmittingWrapper(decodable); + ProcessNonemittingWrapper(cost_cutoff); } } @@ -861,6 +862,7 @@ BaseFloat LatticeFasterOnlineDecoder::GetCutoff(Elem *list_head, size_t *tok_cou } +template BaseFloat LatticeFasterOnlineDecoder::ProcessEmitting( DecodableInterface *decodable) { KALDI_ASSERT(active_toks_.size() > 0); @@ -883,6 +885,7 @@ BaseFloat LatticeFasterOnlineDecoder::ProcessEmitting( BaseFloat cost_offset = 0.0; // Used to keep probabilities in a good // dynamic range. + const FstType &fst = dynamic_cast(fst_); // First process the best token to get a hopefully // reasonably tight bound on the next cutoff. The only @@ -891,15 +894,13 @@ BaseFloat LatticeFasterOnlineDecoder::ProcessEmitting( StateId state = best_elem->key; Token *tok = best_elem->val; cost_offset = - tok->tot_cost; - for (fst::ArcIterator > aiter(fst_, state); + for (fst::ArcIterator aiter(fst, state); !aiter.Done(); aiter.Next()) { - Arc arc = aiter.Value(); + const Arc &arc = aiter.Value(); if (arc.ilabel != 0) { // propagate.. 
- arc.weight = Times(arc.weight, - Weight(cost_offset - - decodable->LogLikelihood(frame, arc.ilabel))); - BaseFloat new_weight = arc.weight.Value() + tok->tot_cost; + BaseFloat new_weight = arc.weight.Value() + cost_offset - + decodable->LogLikelihood(frame, arc.ilabel) + tok->tot_cost; if (new_weight + adaptive_beam < next_cutoff) next_cutoff = new_weight + adaptive_beam; } @@ -919,8 +920,8 @@ BaseFloat LatticeFasterOnlineDecoder::ProcessEmitting( // loop this way because we delete "e" as we go. StateId state = e->key; Token *tok = e->val; - if (tok->tot_cost <= cur_cutoff) { - for (fst::ArcIterator > aiter(fst_, state); + if (tok->tot_cost <= cur_cutoff) { + for (fst::ArcIterator aiter(fst, state); !aiter.Done(); aiter.Next()) { const Arc &arc = aiter.Value(); @@ -951,12 +952,35 @@ BaseFloat LatticeFasterOnlineDecoder::ProcessEmitting( return next_cutoff; } +template BaseFloat LatticeFasterOnlineDecoder:: + ProcessEmitting>(DecodableInterface *decodable); +template BaseFloat LatticeFasterOnlineDecoder:: + ProcessEmitting>(DecodableInterface *decodable); +template BaseFloat LatticeFasterOnlineDecoder:: + ProcessEmitting>(DecodableInterface *decodable); + +BaseFloat LatticeFasterOnlineDecoder::ProcessEmittingWrapper( + DecodableInterface *decodable) { + if (fst_.Type() == "const") { + return LatticeFasterOnlineDecoder:: + ProcessEmitting>(decodable); + } else if (fst_.Type() == "vector") { + return LatticeFasterOnlineDecoder:: + ProcessEmitting>(decodable); + } else { + return LatticeFasterOnlineDecoder:: + ProcessEmitting>(decodable); + } +} + +template void LatticeFasterOnlineDecoder::ProcessNonemitting(BaseFloat cutoff) { KALDI_ASSERT(!active_toks_.empty()); int32 frame = static_cast(active_toks_.size()) - 2; // Note: "frame" is the time-index we just processed, or -1 if // we are processing the nonemitting transitions before the // first frame (called from InitDecoding()). + const FstType &fst = dynamic_cast(fst_); // Processes nonemitting arcs for one frame. 
Propagates within toks_. // Note-- this queue structure is is not very optimal as @@ -988,7 +1012,7 @@ void LatticeFasterOnlineDecoder::ProcessNonemitting(BaseFloat cutoff) { // but since most states are emitting it's not a huge issue. tok->DeleteForwardLinks(); // necessary when re-visiting tok->links = NULL; - for (fst::ArcIterator > aiter(fst_, state); + for (fst::ArcIterator aiter(fst, state); !aiter.Done(); aiter.Next()) { const Arc &arc = aiter.Value(); @@ -1013,6 +1037,26 @@ void LatticeFasterOnlineDecoder::ProcessNonemitting(BaseFloat cutoff) { } // while queue not empty } +template void LatticeFasterOnlineDecoder:: + ProcessNonemitting>(BaseFloat cutoff); +template void LatticeFasterOnlineDecoder:: + ProcessNonemitting>(BaseFloat cutoff); +template void LatticeFasterOnlineDecoder:: + ProcessNonemitting>(BaseFloat cutoff); + +void LatticeFasterOnlineDecoder::ProcessNonemittingWrapper( + BaseFloat cost_cutoff) { + if (fst_.Type() == "const") { + return LatticeFasterOnlineDecoder:: + ProcessNonemitting>(cost_cutoff); + } else if (fst_.Type() == "vector") { + return LatticeFasterOnlineDecoder:: + ProcessNonemitting>(cost_cutoff); + } else { + return LatticeFasterOnlineDecoder:: + ProcessNonemitting>(cost_cutoff); + } +} void LatticeFasterOnlineDecoder::DeleteElems(Elem *list) { for (Elem *e = list, *e_tail; e != NULL; e = e_tail) { diff --git a/src/decoder/lattice-faster-online-decoder.h b/src/decoder/lattice-faster-online-decoder.h index b69b5492fb7..6cf0503d891 100644 --- a/src/decoder/lattice-faster-online-decoder.h +++ b/src/decoder/lattice-faster-online-decoder.h @@ -3,6 +3,7 @@ // Copyright 2009-2013 Microsoft Corporation; Mirko Hannemann; // 2013-2014 Johns Hopkins University (Author: Daniel Povey) // 2014 Guoguo Chen +// 2018 Zhehuai Chen // See ../../COPYING for clarification regarding multiple authors // @@ -337,12 +338,18 @@ class LatticeFasterOnlineDecoder { /// Processes emitting arcs for one frame. Propagates from prev_toks_ to cur_toks_. 
/// Returns the cost cutoff for subsequent ProcessNonemitting() to use. - BaseFloat ProcessEmitting(DecodableInterface *decodable); + /// Templated on FST type for speed; called via ProcessEmittingWrapper(). + template BaseFloat ProcessEmitting(DecodableInterface *decodable); + + BaseFloat ProcessEmittingWrapper(DecodableInterface *decodable); /// Processes nonemitting (epsilon) arcs for one frame. Called after /// ProcessEmitting() on each frame. The cost cutoff is computed by the /// preceding ProcessEmitting(). - void ProcessNonemitting(BaseFloat cost_cutoff); + /// the templated design is similar to ProcessEmitting() + template void ProcessNonemitting(BaseFloat cost_cutoff); + + void ProcessNonemittingWrapper(BaseFloat cost_cutoff); // HashList defined in ../util/hash-list.h. It actually allows us to maintain // more than one list (e.g. for current and previous frames), but only one of @@ -361,7 +368,7 @@ class LatticeFasterOnlineDecoder { // make it class member to avoid internal new/delete. const fst::Fst &fst_; bool delete_fst_; - std::vector cost_offsets_; // This contains, for each + std::vector cost_offsets_; // This contains, for each // frame, an offset that was added to the acoustic log-likelihoods on that // frame in order to keep everything in a nice dynamic range i.e. close to // zero, to reduce roundoff errors. From ee518b106e4dc334b3c02752a1bda01f66abb120 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 27 Jan 2018 00:54:46 -0500 Subject: [PATCH 090/184] [src] Fixes to compression and matrix-extend code; started work on CUDA stuff. 
--- src/cudamatrix/cu-compressed-matrix.h | 17 +- src/cudamatrix/cu-kernels-ansi.h | 16 ++ src/cudamatrix/cu-kernels.cu | 52 +++++ src/cudamatrix/cu-kernels.h | 24 +++ src/cudamatrix/cu-matrix-lib.h | 1 + src/nnet3/nnet-analyze.cc | 24 +-- src/nnet3/nnet-optimize-utils.cc | 264 ++++++++++++++++++++++++-- src/nnet3/nnet-optimize-utils.h | 10 + src/nnet3/nnet-optimize.cc | 10 +- src/nnet3/nnet-optimize.h | 5 + 10 files changed, 381 insertions(+), 42 deletions(-) diff --git a/src/cudamatrix/cu-compressed-matrix.h b/src/cudamatrix/cu-compressed-matrix.h index 557892ae266..0be1bb391fb 100644 --- a/src/cudamatrix/cu-compressed-matrix.h +++ b/src/cudamatrix/cu-compressed-matrix.h @@ -63,13 +63,15 @@ class CuCompressedMatrixBase { reduce memory use for large networks. It is *not* a CUDA equivalent for class CompressedMatrix (of - ../matrix/compressed-matrix.h). + ../matrix/compressed-matrix.h). Note: this class is only to be used when you + are using a GPU. If you didn't compile for CUDA or you are not using a GPU, + you are not supposed to create an instance of this class, and doing so will + cause a runtime error. */ template class CuCompressedMatrix: public CuCompressedMatrixBase { public: - /// Constructor which sets 'scale_' according to /// scale_ = range / std::numeric_limits::max(). /// @@ -90,6 +92,8 @@ class CuCompressedMatrix: public CuCompressedMatrixBase { ~CuCompressedMatrix(); private: + // If there was data in 'data_', frees it, and sets it to NULL. + void Destroy(); // The raw data. I *data_; @@ -117,12 +121,12 @@ class CuCompressedMatrix: public CuCompressedMatrixBase { // This enum value is used to encode the type you want to instantiate // a CuCompressedMatrix with. It's used in class NnetComputation // (cast to int32) as one of the arguments of kCompressMatrix. 
-enum { +enum CuCompressedMatrixType { kCompressedMatrixInt8 = 1, kCompressedMatrixUint8 = 2, kCompressedMatrixInt16 = 3, kCompressedMatrixUint16 = 4 -} CuCompressedMatrixType; +}; /** This function allocates a new CuCompressedMatrix with type determined @@ -135,9 +139,6 @@ CuCompressedMatrixBase *NewCuCompressedMatrix(CuCompressedMatrixType t, BaseFloat range); - - - - +} // namespace kaldi #endif diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 7d2db9adcc9..8a95ca09537 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -30,6 +30,15 @@ #if HAVE_CUDA == 1 extern "C" { +// "C" version of the BaseFloat typedef-- this saves us having to write +// multiple versions of these kernels. +#if (KALDI_DOUBLEPRECISION != 0) +typedef double BaseFloat; +#else +typedef float BaseFloat; +#endif + + void cudaD_add_col_sum_mat(int Gr, int Bl, double* result, const double* mat, const MatrixDim d, const double alpha, const double beta); @@ -736,6 +745,13 @@ void cudaF_vec_soft_max(int Gr, int Bl, float* v, int dim); void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc); void cudaF_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc); + +void cuda_compress_double_to_int16(dim3 Gr, dim3 Bl, const double *src, + MatrixDim dim, int16_t *dest, + int dest_stride, double inv_scale); +void cuda_compress_int8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, MatrixDim dim, + unsigned char *dest, int dest_stride); + } // extern "C" #endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 2f8f37224be..f62a07d8917 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -3558,6 +3558,47 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int have_dropout_m } } + +__global__ +static void _cuda_compress_int8_sign(const BaseFloat *src, MatrixDim dim, + unsigned char *dest, int dest_stride) { + int i = blockIdx.x * 
blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dest_index = i + j * dest_stride, + src_index = i + j * dim.stride; + if (i < d.cols && j < d.rows) { + BaseFloat f = src[src_index]; + dest[dest_index] = (f > 0.0 ? (unsigned char)1 : (unsigned char)0); + } +} + + +// this version of the function will only be used if BaseFloat is double. +__global__ +static void _cuda_compress_double_to_int16(const double *src, MatrixDim dim, + int16_t *dest, int dest_stride, + double inv_scale) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dest_index = i + j * dest_stride, + src_index = i + j * dim.stride; + int ok = (i < d.cols && j < d.rows); + if (ok) { + BaseFloat f = src[src_index]; + int i = __double2int_rn(f * inv_scale); + // note: SignedInt will be int8 or (more likely) int16. + int16_t s; + if (i < -32768) s = -32768; + else if (i > 32767) s = 32767; + else s = i; + } + __syncthreads(); + if (ok) { + dest[dest_index] = s; + } +} + + /*********************************************************************** * ANSI-C wrappers of CUDA kernels */ @@ -5220,3 +5261,14 @@ void cudaF_apply_exp_special(dim3 Gr, dim3 Bl, float* out, MatrixDim out_dim, _apply_exp_special<<>>(out, out_dim, in, in_stride); } +void cuda_compress_int8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, MatrixDim dim, + unsigned char *dest, int dest_stride) { + _cuda_compress_int8_sign<<>>(src, dim, dest, dest_stride); +} + +void cuda_compress_double_to_int16(dim3 Gr, dim3 Bl, const double *src, + MatrixDim dim, int16_t *dest, + int dest_stride, double inv_scale) { + _cuda_compress_double_to_int16<<>>(src, dim, dest, dest_stride, + inv_scale); +} diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 27ccf760557..dba1a0516a3 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -1463,6 +1463,30 @@ inline void cuda_vec_sum(int Gr, int Bl, float* v, float* value, int dim, 
cudaF_vec_sum(Gr, Bl, v, value, dim, inc); } +// Compresses the matrix in 'src' to 'dest', retaining only zero-one +// information (1 if the value is >0, 0 otherwise) +inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, unsigned char *dest, + int dest_stride) { + cuda_int8_compress_sign(Gr, Bl, src, dim, dest, dest_stride); +} +// this template handles the other types that are not instantiated yet, +// to avoid compilation errors. +template +inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, I *dest, + int dest_stride) { + KALDI_ERR << "Not implemented for this type."; +} + +inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, unsigned char *dest, + int dest_stride, BaseFloat inv_scale) { + cuda_int8_compress_sign(Gr, Bl, src, dim, dest, dest_stride, inv_scale); +} + + + } // namespace kaldi #endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-matrix-lib.h b/src/cudamatrix/cu-matrix-lib.h index ef21a2945f1..1da7efafc97 100644 --- a/src/cudamatrix/cu-matrix-lib.h +++ b/src/cudamatrix/cu-matrix-lib.h @@ -29,5 +29,6 @@ #include "cudamatrix/cu-sparse-matrix.h" #include "cudamatrix/cu-block-matrix.h" #include "cudamatrix/cu-rand.h" +#include "cudamatrix/cu-compressed-matrix.h" #endif diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index a2517989294..551b50ff6ad 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -694,8 +694,8 @@ void ComputationChecker::CheckComputationCompression() const { // 'middle_command' will be the index of the command that separates // the forward and backward passes. 
int32 middle_command = -1; - for (size_t i = 0; i < computation->commands.size(); i++) { - if (computation->commands[i].command_type == kNoOperationMarker) { + for (size_t i = 0; i < computation_.commands.size(); i++) { + if (computation_.commands[i].command_type == kNoOperationMarker) { middle_command = static_cast(i); break; } @@ -705,7 +705,7 @@ void ComputationChecker::CheckComputationCompression() const { int32 num_accesses = accesses.accesses.size(); for (int32 a = 0; a < num_accesses; a++) { const Access &access = accesses.accesses[a]; - int32 command_index = accesses.command_inex; + int32 command_index = access.command_index; const NnetComputation::Command &command = computation_.commands[command_index]; if (command.command_type == kUncompressMatrix) { @@ -715,7 +715,7 @@ void ComputationChecker::CheckComputationCompression() const { a > 0 && computation_.commands[ accesses.accesses[a-1].command_index].command_type == kCompressMatrix); - + } if (command.command_type == kCompressMatrix) { // check that the next access to this matrix is an uncompression // command. @@ -730,14 +730,14 @@ void ComputationChecker::CheckComputationCompression() const { // make sure there are only 2 commands after this: the uncompress // command, a relu backprop command, and a deallocation command. KALDI_ASSERT(a > 0 && command.arg2 == kCompressedMatrixUint8 && - num_accesses <= a + 4); - // make sure the previous access to that matrix was a ReLU - // propagation. - int32 previous_command_index = accesses.accesses[a-1].command_index; - const NnetComputation::Command &previous_command = - computation_.commands[previous_command_index]; - KALDI_ASSERT(previous_command.command_type == kPropagate && - nnet_.GetComponent(previous_command.arg1).Type() == + num_accesses == a + 4); + // make sure the next access to that matrix, apart from the + // uncompression command, is a ReLU propagation. 
+ int32 next_command_index = accesses.accesses[a+2].command_index; + const NnetComputation::Command &next_command = + computation_.commands[next_command_index]; + KALDI_ASSERT(next_command.command_type == kBackprop && + nnet_.GetComponent(next_command.arg1)->Type() == "RectifiedLinearComponent"); } } diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 0d64165efe4..75521a43658 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -21,7 +21,6 @@ #include "nnet3/nnet-optimize-utils.h" #include "nnet3/nnet-optimize.h" - namespace kaldi { namespace nnet3 { @@ -1016,6 +1015,211 @@ std::pair VariableMergingOptimizer::MayBeMerged( } +// This class is used inside the function +// `void ExtendMatrices(NnetComputation *computation)`; +// see that function's declaration in nnet-optimize-utils.h for +// a summary of what this class does. +class MatrixExtender { + public: + typedef NnetComputation::SubMatrixInfo SubMatrixInfo; + typedef NnetComputation::MatrixInfo MatrixInfo; + + MatrixExtender(NnetComputation *computation); + + void ExtendMatrices(); + + private: + // This function returns true if a copy command from 'src_submatrix' + // to 'dest_submatrix' has the properties we need to be able to + // extend its rows to cover all of the source matrix. + bool CanBeExtended(int32 dest_submatrix_index, + int32 src_submatrix_index); + + // This actually extends the matrices... it's called only if CanBeExtended() + // with the same args returned true. It modifies 'dest_submatrix_index' + // and 'src_submatrix_index'. + void Extend(int32 *dest_submatrix_index, int32 *src_submatrix_index); + + // This function modifies the computation to fix certain problems + // that might have been introduced by Extend()... allocation, deallocation, + // + void FixComputation(); + + // don't extend a destination matrix if it wasn't already + // at least 'min_proportion' (80%) big enough to store the source. 
+ BaseFloat min_proportion_; + + NnetComputation *computation_; + + // Indexed by matrix-index m, orig_num_rows_[m] is the value of + // computation_->matrices[m].num_rows when this class was initialized, + // i.e. before we changed anything. + std::vector orig_num_rows_; + + // Indexed by matrix-index m, this vector contains true if matrix + // m is involved in any AcceptInput() or ProvideOutput() operations. + std::vector is_input_or_output_; +}; + +MatrixExtender::MatrixExtender(NnetComputation *computation): + min_proportion_(0.8), + computation_(computation) { + int32 num_matrices = computation_->matrices.size(); + + { // set up orig_num_rows_. + orig_num_rows_.resize(num_matrices); + // matrix 0 is not a real matrix so skip that index. + for (int32 m = 1; m < num_matrices; m++) + orig_num_rows_[m] = computation_->matrices[m].num_rows; + } + { // set up is_input_or_output_. + is_input_or_output_.resize(num_matrices, false); + std::vector::iterator + command_iter = computation_->commands.begin(), + command_end = computation_->commands.end(); + for (; command_iter != command_end; ++command_iter) { + const NnetComputation::Command &command = *command_iter; + // make sure there are no kSwapMatrix commands; they should not be present + // at this stage of optimization. + KALDI_ASSERT(command.command_type != kSwapMatrix); + if (command.command_type == kProvideOutput || + command.command_type == kAcceptInput) { + int32 s = command.arg1, + m = computation_->submatrices[s].matrix_index; + is_input_or_output_[m] = true; + } + } + } +} + + +bool MatrixExtender::CanBeExtended(int32 dest_submatrix_index, + int32 src_submatrix_index) { + const SubMatrixInfo + &src_submatrix = computation_->submatrices[src_submatrix_index], + &dest_submatrix = computation_->submatrices[dest_submatrix_index]; + if (src_submatrix.matrix_index == dest_submatrix.matrix_index) + return false; + + // we can't resize the destination matrix if it's involved in input or output. 
+ if (is_input_or_output_[dest_submatrix.matrix_index]) + return false; + + const MatrixInfo + &src_matrix = computation_->matrices[src_submatrix.matrix_index]; + + int32 dest_matrix_num_rows = orig_num_rows_[dest_submatrix.matrix_index]; + + if (src_submatrix.num_rows < min_proportion_ * src_matrix.num_rows) + return false; + + // The following checks that the source submatrix covers be all of the + // source matrix except a few final rows, and the destination submatrix goes + // to the final row of its matrix. + return (src_submatrix.col_offset == 0 && + src_submatrix.num_cols == src_matrix.num_cols && + src_submatrix.row_offset == 0 && + src_submatrix.num_rows < src_matrix.num_rows && + dest_submatrix.row_offset + dest_submatrix.num_rows == + dest_matrix_num_rows); +} + + +void MatrixExtender::Extend(int32 *dest_submatrix_index, + int32 *src_submatrix_index) { + // copy the SubMatrixInfo to avoid iterator invalidation. + SubMatrixInfo + src_submatrix = computation_->submatrices[*src_submatrix_index], + dest_submatrix = computation_->submatrices[*dest_submatrix_index]; + + MatrixInfo &src_matrix = computation_->matrices[src_submatrix.matrix_index], + &dest_matrix = computation_->matrices[dest_submatrix.matrix_index]; + + int32 new_dest_num_rows = dest_submatrix.row_offset + src_matrix.num_rows; + + // extend the destination matrix so it has enough rows to fit the entire + // source matrix. Note: doing this will break certain invariances in the + // computation, principally with allocation and deallocation commands, which + // we'll later fix up by calling FixComputation(). + if (new_dest_num_rows > dest_matrix.num_rows) { + dest_matrix.num_rows = new_dest_num_rows; + // make sure there's a submatrix index covering the whole of the dest matrix. 
+ computation_->submatrices.push_back( + SubMatrixInfo(dest_submatrix.matrix_index, 0, new_dest_num_rows, + 0, dest_matrix.num_cols)); + } + + // The following 3 statements create a new submatrix that will be + // the destination submatrix; it's the same as the original destination + // submatrix, but with a few extra rows. + *dest_submatrix_index = computation_->submatrices.size(); + dest_submatrix.num_rows = src_matrix.num_rows; + computation_->submatrices.push_back( + SubMatrixInfo(dest_submatrix)); + + // The following 3 statements create a new submatrix that will be + // the source submatrix; it's the same as the original source + // submatrix, but with a few extra rows, and actually will cover + // the entire source matrix. + *src_submatrix_index = computation_->submatrices.size(); + computation_->submatrices.push_back( + SubMatrixInfo(src_submatrix.matrix_index, 0, src_matrix.num_rows, + 0, src_matrix.num_cols)); +} + +void MatrixExtender::ExtendMatrices() { + std::vector::iterator + command_iter = computation_->commands.begin(), + command_end = computation_->commands.end(); + bool changed = false; + for (; command_iter != command_end; ++command_iter) { + NnetComputation::Command &command = *command_iter; + if (command.command_type == kMatrixCopy && + command.alpha == 1.0) { + int32 dest_submatrix_index = command.arg1, + src_submatrix_index = command.arg2; + if (CanBeExtended(dest_submatrix_index, src_submatrix_index)) { + Extend(&command.arg1, &command.arg2); + changed = true; + } + } + } + if (changed) + FixComputation(); +} + +void MatrixExtender::FixComputation() { + // make sure that allocation and deallocation commands + // operate on whole matrix. 
+ std::vector::iterator + command_iter = computation_->commands.begin(), + command_end = computation_->commands.end(); + std::vector whole_submatrices; + computation_->GetWholeSubmatrices(&whole_submatrices); + for (; command_iter != command_end; ++command_iter) { + NnetComputation::Command &command = *command_iter; + if (command.command_type == kAllocMatrix || + command.command_type == kDeallocMatrix) { + int32 s = command.arg1, + m = computation_->submatrices[s].matrix_index, + new_s = whole_submatrices[m]; + if (new_s != s) { + KALDI_ASSERT(orig_num_rows_[m] != computation_->matrices[m].num_rows); + command.arg1 = new_s; + } + } + } + RenumberComputation(computation_); +} + + +void ExtendMatrices(NnetComputation *computation) { + MatrixExtender ext(computation); + ext.ExtendMatrices(); +} + + + /** This class is responsible for consolidating the model-update part of backprop commands, for components in (e.g.) recurrent networks that need to have many separate backprop commands, into more efficient single commands @@ -2553,7 +2757,8 @@ static void ConvertNumNValues(int32 n_stride, int32 old_N, int32 new_N, // This class implements the internals of the ExpandComputation() function (used // in shortcut compilation); see comment by the declaration of -// ExpandComputation() in nnet-optimize-utils.h for overview. +// ExpandComputation() in nnet-optimize-utils.h for overview. (It relates to +// shortcut compilation). class ComputationExpander { public: ComputationExpander(const Nnet &nnet, @@ -4062,7 +4267,7 @@ void InsertCommands( num_old_commands = computation->commands.size(); if (num_new_commands == 0) return; - CommandPairOperator comparison_operator; + CommandPairComparator comparison_operator; // use std::stable_sort so that for entries in 'new_commands' that // have the same .first value, they stay in the same order they were // in before sorting. 
@@ -4174,14 +4379,13 @@ class MemoryCompressionOptimizer {
const Nnet &nnet_;
int32 memory_compression_level_;
+ int32 middle_command_;
NnetComputation *computation_;
Analyzer analyzer_;
};
void MemoryCompressionOptimizer::ModifyComputation() {
- int32 cur_num_commands = computation_->commands.size();
-
// whole_submatrices[m] is the submatrix-index of the submatrix that
// represents the whole of matrix m.
std::vector whole_submatrices;
@@ -4221,14 +4425,21 @@ void MemoryCompressionOptimizer::Optimize() {
for (int32 m = 1; m < num_matrices; m++)
ProcessMatrix(m);
if (!compress_info_.empty())
- ModifyComputatin();
+ ModifyComputation();
}
void MemoryCompressionOptimizer::ProcessMatrix(int32 m) {
+ if (analyzer_.matrix_accesses[m].is_output) {
+ return; // We can't do this optimization for matrices that are going to be
+ // output to the user.
+ }
+
// 'accesses' list the commands that access this matrix.
const std::vector &accesses = analyzer_.matrix_accesses[m].accesses;
- Access middle_access;
- middle_access.command_index = middle_command_;
+ // the 'kReadAccess' below is actually a don't-care. This is just
+ // to find the position in 'accesses' that corresponds to command-index
+ // 'middle_command'.
+ Access middle_access(middle_command_, kReadAccess);
std::vector::const_iterator
iter = std::lower_bound(accesses.begin(), accesses.end(),
middle_access);
@@ -4252,12 +4463,14 @@ void MemoryCompressionOptimizer::ProcessMatrix(int32 m) {
&forward_access = iter[-1];
KALDI_ASSERT(forward_access.command_index < middle_command_ &&
backward_access.command_index > middle_command_);
+
// 'backward_access_is_last_access' is going to be set to true if
// 'backward_access' is the last command to access the matrix (apart from
// deallocation commands).
bool backward_access_is_last_access = false;
- if (accesses.end() - backward_access <= 2) {
- // if there is at most 1 command after 'backward_access'...
+ if (accesses.end() - iter <= 2) { + // if there is at most 1 command after 'backward_access' that accesses this + // matrix... const Access &next_access = iter[1]; NnetComputation::Command &next_command = computation_->commands[next_access.command_index]; @@ -4268,21 +4481,17 @@ void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { int32 backward_command_index = backward_access.command_index, forward_command_index = forward_access.command_index; NnetComputation::Command - &forward_command = computation_->commands[forward_command_index], &backward_command = computation_->commands[backward_command_index]; if (memory_compression_level_ >= 1 && backward_access_is_last_access && - forward_access.access_type == kWriteAccess && backward_access.access_type == kReadAccess && - forward_command.command_type == kPropagate && backward_command.command_type == kBackprop) { - int32 component_index = backward_access.arg1; + int32 component_index = backward_command.arg1; const Component *component = nnet_.GetComponent(component_index); // this is potentially a candidate for our optimization for ReLU units, - // where we only store the sign. - if (component->Type() == "RectifiedLinearComponent" && - component_index == forward_access.arg1) { + // where we only need to store the sign. + if (component->Type() == "RectifiedLinearComponent") { compress_info_.push_back( MatrixCompressInfo(m, forward_command_index, backward_command_index, @@ -4291,9 +4500,21 @@ void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { } } - // TODO: we can later implement compression for other cases. - // + // If memory_compression_level >= 2 (an "intermediate" level of compression), + // then we'll consider compressing quantities using 16 bits in the range + // [-10, 10]. Because of the way this compression works, exact zero will + // still be uncompressed as exact zero, so even if this is the output + // of a ReLU, it's OK. 
(Having a few derivatives zero for ReLU outputs + // that were very close to zero is OK.) + if (memory_compression_level_ >= 2) { + compress_info_.push_back( + MatrixCompressInfo(m, forward_command_index, + backward_command_index, + kCompressedMatrixInt16, 10.0)); + return; + } + // TODO: later maybe implement something for memory compression level = 3. } @@ -4318,10 +4539,11 @@ void OptimizeMemoryCompression(const Nnet &nnet, if (middle_command < 0) { middle_command = static_cast(i); } else { - KALDI_WARN << "Found more than one command of tyep kNoOperationMarker " + KALDI_WARN << "Found more than one command of type kNoOperationMarker " "in non-looped computation."; // there are more than one command of this type... this wasn't expected. - return false; + // return (i.e. do nothing). + return; } } } diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 1ffcc330adf..93f3cdb128f 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -181,6 +181,16 @@ class VariableMergingOptimizer { bool already_called_merge_variables_; }; +/** + This is not really an optimization in itself but it can make things easier + for class VariableMergingOptimizer (usually called by its wrapper + VariableMergingOptimization()). It looks for a case where most of a matrix + (but not its final rows) are copied to some submatrix of another matrix, + where the row-range of that submatrix extends to the last row of the other + matrix; and it extends the other matrix with additional rows so that the + entire source matrix can be copied to the destination. 
+ */ +void ExtendMatrices(NnetComputation *computation); /** diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 3dff8c0a4f3..872624eaa7e 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -515,6 +515,14 @@ void Optimize(const NnetOptimizeOptions &config, CheckComputation(nnet, *computation, true); } + if (config.optimize && config.extend_matrices && + !config.optimize_looped_computation) { + ExtendMatrices(computation); + if (GetVerboseLevel() >= 3) + CheckComputation(nnet, *computation, false); + } + + if (config.optimize && (config.remove_assignments || config.backprop_in_place || config.propagate_in_place)) { @@ -582,7 +590,7 @@ void Optimize(const NnetOptimizeOptions &config, if (config.memory_compression_level > 0 && !config.optimize_looped_computation) { - OptimizeMemoryCompression(nnet_, config.memory_compression_level, + OptimizeMemoryCompression(nnet, config.memory_compression_level, computation); if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index d3ecf01847a..ba8efce0fe3 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -39,6 +39,7 @@ struct NnetOptimizeOptions { bool propagate_in_place; bool backprop_in_place; bool optimize_row_ops; + bool extend_matrices; bool convert_addition; bool remove_assignments; bool allow_left_merge; @@ -62,6 +63,7 @@ struct NnetOptimizeOptions { propagate_in_place(true), backprop_in_place(true), optimize_row_ops(true), + extend_matrices(true), convert_addition(true), remove_assignments(true), allow_left_merge(true), @@ -87,6 +89,9 @@ struct NnetOptimizeOptions { "disable optimization that allows in-place propagation"); opts->Register("backprop-in-place", &backprop_in_place, "Set to false to " "disable optimization that allows in-place backprop"); + opts->Register("extend-matrices", &extend_matrices, "This optimization " + "can reduce memory requirements for TDNNs 
when applied " + "together with --convert-addition=true"); opts->Register("optimize-row-ops", &optimize_row_ops, "Set to false to " "disable certain optimizations that act on operations of " "type *Row*."); From 0269f3634f5aa2f4166b680e86815fee78eaed04 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 27 Jan 2018 15:14:55 -0500 Subject: [PATCH 091/184] [src] Add cu-compressed-matrix.cc --- src/cudamatrix/Makefile | 3 +- src/cudamatrix/cu-compressed-matrix.cc | 112 +++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 src/cudamatrix/cu-compressed-matrix.cc diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index e6ade23728f..a57685eee06 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -11,7 +11,7 @@ TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test OBJFILES = cu-device.o cu-math.o cu-rand.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \ cu-vector.o cu-common.o cu-tp-matrix.o cu-block-matrix.o \ - cu-sparse-matrix.o cu-allocator.o cu-array.o + cu-sparse-matrix.o cu-allocator.o cu-array.o cu-compressed-matrix.o ifeq ($(CUDA), true) OBJFILES += cu-kernels.o endif @@ -33,4 +33,3 @@ endif $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ include ../makefiles/default_rules.mk - diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc new file mode 100644 index 00000000000..4ca52817271 --- /dev/null +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -0,0 +1,112 @@ +// cudamatrix/cu-compressed-matrix.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#if HAVE_CUDA == 1 +#include +#include +#endif + +#include "base/timer.h" +#include "cudamatrix/cu-common.h" +#include "cudamatrix/cu-vector.h" +#include "cudamatrix/cu-device.h" +#include "cudamatrix/cu-kernels.h" +#include "cudamatrix/cu-array.h" +#include "cudamatrix/cu-compressed-matrix.h" + +namespace kaldi { + + +CuCompressedMatrixBase *NewCuCompressedMatrix(CuCompressedMatrixType t, + BaseFloat range) { + if (t == kCompressedMatrixUint8) { + KALDI_ASSERT(range >= 0); + return new CuCompressedMatrix(range); + } else if (t == kCompressedMatrixInt16) { + KALDI_ASSERT(range > 0); + return new CuCompressedMatrix(range); + } +} + +template CuCompressedMatrix::CuCompressedMatrix(BaseFloat range): + data_(NULL), scale_(range / std::numeric_limits::max()), + num_rows_(0), num_cols_(0), stride_(0) { +#if HAVE_CUDA == 1 + KALDI_ASSERT(CuDevice::Instantiate().Enabled()); +#endif + KALDI_ERR << "You instantiated CuCompressedMatrix while GPU use " + "was not compiled in."; +} + + +template void CuCompressedMatrix::Destroy() { +#if HAVE_CUDA == 1 + if (data_ != NULL) { + CuTimer tim; + CuDevice::Instantiate().Free(data_); + data_ = NULL; + num_rows_ = 0; + num_cols_ = 0; + stride_ = 0; + CuDevice::Instantiate().AccuProfile(__func__, tim); + } +#endif +} + +template void CuCompressedMatrix::CopyFromMat( + CuMatrixBase &mat) { +#if HAVE_CUDA == 1 + KALDI_ASSERT(CuDevice::Instantiate().Enabled()); + Destroy(); + if (mat.NumRows() == 0) + return; + num_rows_ = mat.NumRows(); + num_cols_ = 
mat.NumCold(); + stride_ = num_cols_; + + CuTimer tim; + data_ = CuDevice::Instantiate().Malloc(sizeof(I) * num_rows_ * num_cols_); + + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + + if (scale_ == 0.0) { // scale == 0 calls a different kernel from the others. + cuda_mat_compress_sign(dimGrid, dimBlock, mat.Data(), mat.Dim(), + data_, stride_); + } else { + cuda_mat_compress(dimGrid, dimBlock, mat.Data(), mat.Dim(), + data_, stride_, 1.0 / scale_); + + } + + CU_SAFE_CALL(cudaGetLastError()); + + + + CuDevice::Instantiate().AccuProfile(CuCompressedMatrix::CopyFromMat(malloc), + tim); + + +#endif +} + + +} // namespace kaldi From 5aa698aab24b93b4bae211391e7543d74ab9e1ac Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 27 Jan 2018 19:29:52 -0500 Subject: [PATCH 092/184] [src] Finish and test CuCompressedMatrix code. --- src/cudamatrix/Makefile | 2 +- src/cudamatrix/cu-compressed-matrix-test.cc | 179 ++++++++++++++++++++ src/cudamatrix/cu-compressed-matrix.cc | 116 ++++++++----- src/cudamatrix/cu-compressed-matrix.h | 39 +++-- src/cudamatrix/cu-kernels-ansi.h | 39 ++++- src/cudamatrix/cu-kernels.cu | 157 ++++++++++++++--- src/cudamatrix/cu-kernels.h | 53 +++++- 7 files changed, 495 insertions(+), 90 deletions(-) create mode 100644 src/cudamatrix/cu-compressed-matrix-test.cc diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index a57685eee06..ca831390ea9 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -7,7 +7,7 @@ LDLIBS += $(CUDA_LDLIBS) TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test cu-packed-matrix-test cu-tp-matrix-test \ cu-block-matrix-test cu-matrix-speed-test cu-vector-speed-test cu-sp-matrix-speed-test cu-array-test \ - cu-sparse-matrix-test cu-device-test cu-rand-speed-test + cu-sparse-matrix-test cu-device-test cu-rand-speed-test cu-compressed-matrix-test OBJFILES = cu-device.o cu-math.o cu-rand.o cu-matrix.o 
cu-packed-matrix.o cu-sp-matrix.o \ cu-vector.o cu-common.o cu-tp-matrix.o cu-block-matrix.o \ diff --git a/src/cudamatrix/cu-compressed-matrix-test.cc b/src/cudamatrix/cu-compressed-matrix-test.cc new file mode 100644 index 00000000000..3cbd7bd5060 --- /dev/null +++ b/src/cudamatrix/cu-compressed-matrix-test.cc @@ -0,0 +1,179 @@ +// cudamatrix/cu-compressed-matrix-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "cudamatrix/cu-matrix-lib.h" + +using namespace kaldi; + + +namespace kaldi { + +void CuCompressedMatrixTestSign() { + int32 num_rows = RandInt(80, 100), + num_cols = RandInt(80, 100); + CuMatrix M(num_rows, num_cols); + M.SetRandn(); + + CuMatrix M2(num_rows, num_cols, kUndefined); + + CuCompressedMatrixBase *cm = NewCuCompressedMatrix(kCompressedMatrixUint8, 0.0); + + // this just stores (M(i, j) > 0 ? 1 : 0). 
+ cm->CopyFromMat(M); + cm->CopyToMat(&M2); + + M.Heaviside(M); + + AssertEqual(M, M2); + delete cm; +} + +void CuCompressedMatrixTestNonnegative() { + int32 num_rows = RandInt(80, 100), + num_cols = RandInt(80, 100); + CuMatrix M(num_rows, num_cols); + M.SetRandUniform(); + + BaseFloat range = 0.5 * RandInt(1, 5); + M.Scale(range); + + CuCompressedMatrixType t = (RandInt(0, 1) == 0 ? + kCompressedMatrixUint8 : + kCompressedMatrixUint16); + + // since the input is in the correct range, truncating or not should make no + // difference. + bool truncate = (RandInt(0, 1) == 0); + + BaseFloat extra_error = 0.0; + if (truncate && (RandInt(0, 1) == 0)) { + // this tests that with truncate == true, adding a small offset, which would + // take us outside the representable range, will not add too much extra + // error. (with truncate == false this would not be true because we wouldn't + // round to the edges of the range, it would wrap around). + extra_error = -0.01 * (RandInt(0, 1) == 0 ? 1.0 : -1.0); + M.Add(extra_error); + } + + CuCompressedMatrixBase *cm = NewCuCompressedMatrix(t, range, truncate); + + CuMatrix M2(num_rows, num_cols, kUndefined); + + cm->CopyFromMat(M); + cm->CopyToMat(&M2); + + + M2.AddMat(-1.0, M); + + BaseFloat diff_max = M2.Max(), + diff_min = M2.Min(); + + BaseFloat + headroom = 1.1, + max_expected_error = fabs(extra_error) + headroom * 0.5 * + range / (t == kCompressedMatrixUint8 ? 255 : 65535); + + KALDI_ASSERT(diff_max < max_expected_error && + diff_min > -1.0 * max_expected_error); + + delete cm; +} + +// this is like CuCompressedMatrixTestNonnegative but +// with signed integers, and input in the range [-range, +range]. +void CuCompressedMatrixTestSymmetric() { + int32 num_rows = RandInt(80, 100), + num_cols = RandInt(80, 100); + CuMatrix M(num_rows, num_cols); + M.SetRandUniform(); + M.Scale(2.0); + M.Add(-1.0); + + BaseFloat range = 0.5 * RandInt(1, 5); + M.Scale(range); + + CuCompressedMatrixType t = (RandInt(0, 1) == 0 ? 
+ kCompressedMatrixInt8 : + kCompressedMatrixInt16); + + // since the input is in the correct range, truncating or not should make no + // difference. + bool truncate = (RandInt(0, 1) == 0); + + BaseFloat extra_error = 0.0; + if (truncate && (RandInt(0, 1) == 0)) { + // this tests that with truncate == true, adding a small offset, which would + // take us outside the representable range, will not add too much extra + // error. (with truncate == false this would not be true because we wouldn't + // round to the edges of the range, it would wrap around). + extra_error = -0.01 * (RandInt(0, 1) == 0 ? 1.0 : -1.0); + M.Add(extra_error); + } + + CuCompressedMatrixBase *cm = NewCuCompressedMatrix(t, range, truncate); + + CuMatrix M2(num_rows, num_cols, kUndefined); + + cm->CopyFromMat(M); + cm->CopyToMat(&M2); + + + M2.AddMat(-1.0, M); + + BaseFloat diff_max = M2.Max(), + diff_min = M2.Min(); + + BaseFloat + headroom = 1.1, + max_expected_error = fabs(extra_error) + headroom * 0.5 * + range / (t == kCompressedMatrixInt8 ? 127 : 32767); + + KALDI_ASSERT(diff_max < max_expected_error && + diff_min > -1.0 * max_expected_error); + + delete cm; +} + + + +} // namespace kaldi + + +int main() { + SetVerboseLevel(1); + // we don't run this test if CUDA is not compiled in, since + // you can't instantiate class CuCompressedMatrix in that case. 
+#if HAVE_CUDA == 1 + CuDevice::Instantiate().SelectGpuId("yes"); + for (int32 i = 1; i < 10; i++) { + CuCompressedMatrixTestSign(); + CuCompressedMatrixTestNonnegative(); + CuCompressedMatrixTestSymmetric(); + } + +#endif + return 0; +} diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index 4ca52817271..be02921169d 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -34,79 +34,109 @@ namespace kaldi { -CuCompressedMatrixBase *NewCuCompressedMatrix(CuCompressedMatrixType t, - BaseFloat range) { - if (t == kCompressedMatrixUint8) { - KALDI_ASSERT(range >= 0); - return new CuCompressedMatrix(range); - } else if (t == kCompressedMatrixInt16) { - KALDI_ASSERT(range > 0); - return new CuCompressedMatrix(range); - } -} - -template CuCompressedMatrix::CuCompressedMatrix(BaseFloat range): +template +CuCompressedMatrix::CuCompressedMatrix(BaseFloat range, bool truncate): data_(NULL), scale_(range / std::numeric_limits::max()), - num_rows_(0), num_cols_(0), stride_(0) { + truncate_(truncate), num_rows_(0), num_cols_(0), stride_(0) { #if HAVE_CUDA == 1 KALDI_ASSERT(CuDevice::Instantiate().Enabled()); -#endif +#else KALDI_ERR << "You instantiated CuCompressedMatrix while GPU use " "was not compiled in."; +#endif } - -template void CuCompressedMatrix::Destroy() { +template +void CuCompressedMatrix::Destroy() { #if HAVE_CUDA == 1 if (data_ != NULL) { - CuTimer tim; + // we don't bother timing this because Free() won't normally have to + // access the GPU at all (due to caching). 
CuDevice::Instantiate().Free(data_); data_ = NULL; num_rows_ = 0; num_cols_ = 0; stride_ = 0; - CuDevice::Instantiate().AccuProfile(__func__, tim); } #endif } -template void CuCompressedMatrix::CopyFromMat( - CuMatrixBase &mat) { +template +void CuCompressedMatrix::CopyFromMat( + const CuMatrixBase &mat) { #if HAVE_CUDA == 1 KALDI_ASSERT(CuDevice::Instantiate().Enabled()); - Destroy(); if (mat.NumRows() == 0) return; - num_rows_ = mat.NumRows(); - num_cols_ = mat.NumCold(); - stride_ = num_cols_; - - CuTimer tim; - data_ = CuDevice::Instantiate().Malloc(sizeof(I) * num_rows_ * num_cols_); - - dim3 dimGrid, dimBlock; - GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), - &dimGrid, &dimBlock); - - if (scale_ == 0.0) { // scale == 0 calls a different kernel from the others. - cuda_mat_compress_sign(dimGrid, dimBlock, mat.Data(), mat.Dim(), - data_, stride_); - } else { - cuda_mat_compress(dimGrid, dimBlock, mat.Data(), mat.Dim(), - data_, stride_, 1.0 / scale_); - + if (num_rows_ != mat.NumRows() || num_cols_ != mat.NumCols()) { + Destroy(); + num_rows_ = mat.NumRows(); + num_cols_ = mat.NumCols(); + data_ = static_cast( + CuDevice::Instantiate().Malloc(sizeof(I) * num_rows_ * num_cols_)); + stride_ = num_cols_; } + { + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + + if (scale_ == 0.0) { // scale == 0 calls a different kernel from the others. 
+ cuda_mat_compress_sign(dimGrid, dimBlock, mat.Data(), mat.Dim(), + data_, stride_); + } else { + cuda_mat_compress(dimGrid, dimBlock, mat.Data(), mat.Dim(), + data_, stride_, float(1.0 / scale_), + truncate_); + } CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim); + } +#endif +} - - CuDevice::Instantiate().AccuProfile(CuCompressedMatrix::CopyFromMat(malloc), - tim); +template +void CuCompressedMatrix::CopyToMat(CuMatrixBase *mat) const { +#if HAVE_CUDA == 1 + KALDI_ASSERT(CuDevice::Instantiate().Enabled()); + KALDI_ASSERT(mat->NumRows() == num_rows_ && mat->NumCols() == num_cols_); + { + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + BaseFloat scale = (scale_ == 0.0 ? 1.0 : scale_); + cuda_mat_uncompress(dimGrid, dimBlock, mat->Data(), mat->Dim(), + data_, stride_, float(scale)); + } +#endif +} -#endif +CuCompressedMatrixBase *NewCuCompressedMatrix(CuCompressedMatrixType t, + BaseFloat range, + bool truncat) { + if (t == kCompressedMatrixUint8) { + KALDI_ASSERT(range >= 0); + return new CuCompressedMatrix(range); + } else if (t == kCompressedMatrixInt8) { + KALDI_ASSERT(range >= 0); + return new CuCompressedMatrix(range); + } else if (t == kCompressedMatrixUint16) { + KALDI_ASSERT(range > 0); + return new CuCompressedMatrix(range); + } else if (t == kCompressedMatrixInt16) { + KALDI_ASSERT(range > 0); + return new CuCompressedMatrix(range); + } else { + KALDI_ERR << "Unknown compressed-matrix type"; + return NULL; + } } + } // namespace kaldi diff --git a/src/cudamatrix/cu-compressed-matrix.h b/src/cudamatrix/cu-compressed-matrix.h index 0be1bb391fb..2eafc20c6cc 100644 --- a/src/cudamatrix/cu-compressed-matrix.h +++ b/src/cudamatrix/cu-compressed-matrix.h @@ -38,20 +38,20 @@ class CuCompressedMatrixBase { /// Sets *this to an appropriately compressed copy of 'mat', which /// includes resizing *this. 
The details of how this is done will be /// different in different child classes. - virtual void CopyFromMat(CuMatrixBase &mat) = 0; + virtual void CopyFromMat(const CuMatrixBase &mat) = 0; /// Copies the contents of *this to 'mat', which should be /// correctly sized beforehand. - virtual void CopyToMat(CuMatrixBase *mat) = 0; + virtual void CopyToMat(CuMatrixBase *mat) const = 0; // The number of rows in *this. - virtual int32 NumRows() = 0; + virtual int32 NumRows() const = 0; // The number of columns in *this. - virtual int32 NumCols() = 0; + virtual int32 NumCols() const = 0; - ~CuCompressedMatrixBase() { } + virtual ~CuCompressedMatrixBase() { } }; @@ -78,18 +78,25 @@ class CuCompressedMatrix: public CuCompressedMatrixBase { /// range = 0 (only supported for I == int8) is a special case in which only /// the sign of the input is retained; and when we reconstruct, the output /// will be -1, 0 or 1. - CuCompressedMatrix(BaseFloat range); + /// + /// truncate (only relevant if range != 0) should be true if it's possible + /// that the input could exceed the allowed input range, i.e. [0, range] if I + /// is unsigned, and [-range, range] if I is signed; and it may be false if + /// you know that the input (the matrix given to CopyFromMat) will have + /// elements only in the allowed range. Setting 'truncate' to false + /// allows the compression code to avoid the bounds check. 
+ CuCompressedMatrix(BaseFloat range, bool truncate = true); - virtual void CopyFromMat(CuMatrixBase &mat); + virtual void CopyFromMat(const CuMatrixBase &mat); - virtual void CopyToMat(CuMatrixBase *mat); + virtual void CopyToMat(CuMatrixBase *mat) const; - virtual MatrixIndexT NumRows() { return num_rows_; } + virtual MatrixIndexT NumRows() const { return num_rows_; } - virtual MatrixIndexT NumCols() { return num_cols_; } + virtual MatrixIndexT NumCols() const { return num_cols_; } - ~CuCompressedMatrix(); + virtual ~CuCompressedMatrix() { Destroy(); } private: // If there was data in 'data_', frees it, and sets it to NULL. @@ -109,6 +116,8 @@ class CuCompressedMatrix: public CuCompressedMatrixBase { // that the output becomes -1, 0 and 1. BaseFloat scale_; + bool truncate_; + MatrixIndexT num_rows_; MatrixIndexT num_cols_; // stride_ is currently always equal to num_cols_; it was added mainly to @@ -130,13 +139,15 @@ enum CuCompressedMatrixType { /** This function allocates a new CuCompressedMatrix with type determined - by t, and with the 'range' parameter provided (range must be >= 0, - 0 as a special case). + by t, and with the 'range' and 'truncate' parameters provided to the + constructor of class CuCompressedMatrix. + It will crash at runtime if called when CUDA is not compiled in, or not enabled. 
*/ CuCompressedMatrixBase *NewCuCompressedMatrix(CuCompressedMatrixType t, - BaseFloat range); + BaseFloat range, + bool truncate = true); } // namespace kaldi diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 8a95ca09537..8ab03c7e14e 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -746,11 +746,40 @@ void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc); void cudaF_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc); -void cuda_compress_double_to_int16(dim3 Gr, dim3 Bl, const double *src, - MatrixDim dim, int16_t *dest, - int dest_stride, double inv_scale); -void cuda_compress_int8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, MatrixDim dim, - unsigned char *dest, int dest_stride); +void cuda_compress_int16(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check); +void cuda_compress_uint16(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check); +void cuda_compress_uint8(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check); +void cuda_compress_int8(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check); + +void cuda_compress_uint8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8_t *dest, int dest_stride); + +void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int16_t *src, + int src_stride, float scale); +void cuda_uncompress_uint16(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint16_t *src, + int src_stride, float scale); +void cuda_uncompress_int8(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int8_t *src, + int src_stride, float scale); +void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, 
BaseFloat *dest, + MatrixDim dim, const uint8_t *src, + int src_stride, float scale); + + } // extern "C" diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index f62a07d8917..b0468b7fa7c 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -3560,44 +3560,102 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int have_dropout_m __global__ -static void _cuda_compress_int8_sign(const BaseFloat *src, MatrixDim dim, - unsigned char *dest, int dest_stride) { +static void _cuda_compress_uint8_sign(const BaseFloat *src, MatrixDim dim, + unsigned char *dest, int dest_stride) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; int dest_index = i + j * dest_stride, src_index = i + j * dim.stride; - if (i < d.cols && j < d.rows) { + if (i < dim.cols && j < dim.rows) { BaseFloat f = src[src_index]; dest[dest_index] = (f > 0.0 ? (unsigned char)1 : (unsigned char)0); } } -// this version of the function will only be used if BaseFloat is double. +// The following inline templated functions are a workaround for the +// fact that (I believe) std::numeric_limits is not available in CUDA; +// they allow us to access the minimum and maximum elements of certain +// types from templated code. 
+template __device__ static inline int minimum_integer_value(); +template __device__ static inline int maximum_integer_value(); + +template<> __device__ int maximum_integer_value() { return 127; } +template<> __device__ int minimum_integer_value() { return -128; } +template<> __device__ int maximum_integer_value() { return 255; } +template<> __device__ int minimum_integer_value() { return 0; } +template<> __device__ int maximum_integer_value() { return 32767; } +template<> __device__ int minimum_integer_value() { return -32768; } +template<> __device__ int maximum_integer_value() { return 65535; } +template<> __device__ int minimum_integer_value() { return 0; } + + + +template __global__ -static void _cuda_compress_double_to_int16(const double *src, MatrixDim dim, - int16_t *dest, int dest_stride, - double inv_scale) { +static void _cuda_compress_bounds_check(const BaseFloat *src, MatrixDim dim, + I *dest, int dest_stride, float inv_scale) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; int dest_index = i + j * dest_stride, src_index = i + j * dim.stride; - int ok = (i < d.cols && j < d.rows); + const int min_value = minimum_integer_value(), + max_value = maximum_integer_value(); + int16_t compressed_value; + int ok = (i < dim.cols && j < dim.rows); if (ok) { - BaseFloat f = src[src_index]; - int i = __double2int_rn(f * inv_scale); + float f = src[src_index]; + // note: I'm not sure what __float2int_rn does if input is outside of + // integer range, but it doesn't matter much as in the situations where this + // type of compression would make sense, the input should be well inside the + // range of 'int', and if it fails, we've probably already catastrophically + // diverged. + int i = __float2int_rn(f * inv_scale); // note: SignedInt will be int8 or (more likely) int16. 
- int16_t s; - if (i < -32768) s = -32768; - else if (i > 32767) s = 32767; - else s = i; + if (i < min_value) compressed_value = min_value; + else if (i > max_value) compressed_value = max_value; + else compressed_value = i; } __syncthreads(); if (ok) { + dest[dest_index] = compressed_value; + } +} + + +template +__global__ +static void _cuda_compress_no_bounds_check(const BaseFloat *src, MatrixDim dim, + I *dest, int dest_stride, + float inv_scale) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dest_index = i + j * dest_stride, + src_index = i + j * dim.stride; + if (i < dim.cols && j < dim.rows) { + float f = src[src_index]; + int i = __float2int_rn(f * inv_scale); + I s = i; dest[dest_index] = s; } } +template +__global__ +static void _cuda_uncompress(BaseFloat *dest, MatrixDim dim, + const I *src, int src_stride, + float scale) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int src_index = i + j * src_stride, + dest_index = i + j * dim.stride; + if (i < dim.cols && j < dim.rows) { + I s = src[src_index]; + dest[dest_index] = float(s * scale); + } +} + + /*********************************************************************** * ANSI-C wrappers of CUDA kernels @@ -5261,14 +5319,69 @@ void cudaF_apply_exp_special(dim3 Gr, dim3 Bl, float* out, MatrixDim out_dim, _apply_exp_special<<>>(out, out_dim, in, in_stride); } -void cuda_compress_int8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, MatrixDim dim, - unsigned char *dest, int dest_stride) { - _cuda_compress_int8_sign<<>>(src, dim, dest, dest_stride); +void cuda_compress_uint8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, MatrixDim dim, + unsigned char *dest, int dest_stride) { + _cuda_compress_uint8_sign<<>>(src, dim, dest, dest_stride); +} + +void cuda_compress_int16(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + if 
(bounds_check) { + _cuda_compress_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } else { + _cuda_compress_no_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } +} +void cuda_compress_uint16(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + if (bounds_check) { + _cuda_compress_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } else { + _cuda_compress_no_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } +} +void cuda_compress_int8(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + if (bounds_check) { + _cuda_compress_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } else { + _cuda_compress_no_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } +} +void cuda_compress_uint8(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + if (bounds_check) { + _cuda_compress_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } else { + _cuda_compress_no_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } } -void cuda_compress_double_to_int16(dim3 Gr, dim3 Bl, const double *src, - MatrixDim dim, int16_t *dest, - int dest_stride, double inv_scale) { - _cuda_compress_double_to_int16<<>>(src, dim, dest, dest_stride, - inv_scale); +void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint8_t *src, + int src_stride, float scale) { + _cuda_uncompress<<>>(dest, dim, src, src_stride, scale); +} +void cuda_uncompress_int8(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int8_t *src, + int src_stride, float scale) { + _cuda_uncompress<<>>(dest, dim, src, src_stride, scale); +} +void cuda_uncompress_uint16(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint16_t *src, + int src_stride, float scale) { + 
_cuda_uncompress<<>>(dest, dim, src, src_stride, scale); +} +void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int16_t *src, + int src_stride, float scale) { + _cuda_uncompress<<>>(dest, dim, src, src_stride, scale); } diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index dba1a0516a3..3518e0c71ed 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -1466,9 +1466,9 @@ inline void cuda_vec_sum(int Gr, int Bl, float* v, float* value, int dim, // Compresses the matrix in 'src' to 'dest', retaining only zero-one // information (1 if the value is >0, 0 otherwise) inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, - MatrixDim dim, unsigned char *dest, + MatrixDim dim, uint8 *dest, int dest_stride) { - cuda_int8_compress_sign(Gr, Bl, src, dim, dest, dest_stride); + cuda_compress_uint8_sign(Gr, Bl, src, dim, dest, dest_stride); } // this template handles the other types that are not instantiated yet, // to avoid compilation errors. 
@@ -1480,11 +1480,54 @@ inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, } inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, - MatrixDim dim, unsigned char *dest, - int dest_stride, BaseFloat inv_scale) { - cuda_int8_compress_sign(Gr, Bl, src, dim, dest, dest_stride, inv_scale); + MatrixDim dim, int16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + cuda_compress_int16(Gr, Bl, src, dim, dest, dest_stride, + inv_scale, bounds_check); +} +inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + cuda_compress_uint16(Gr, Bl, src, dim, dest, dest_stride, + inv_scale, bounds_check); +} +inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + cuda_compress_uint8(Gr, Bl, src, dim, dest, dest_stride, + inv_scale, bounds_check); +} +inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + cuda_compress_int8(Gr, Bl, src, dim, dest, dest_stride, + inv_scale, bounds_check); } +inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int8_t *src, + int src_stride, float scale) { + cuda_uncompress_int8(Gr, Bl, dest, dim, src, src_stride, scale); +} +inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint8_t *src, + int src_stride, float scale) { + cuda_uncompress_uint8(Gr, Bl, dest, dim, src, src_stride, scale); +} +inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int16_t *src, + int src_stride, float scale) { + cuda_uncompress_int16(Gr, Bl, dest, dim, src, src_stride, scale); +} +inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint16_t *src, + int 
src_stride, float scale) { + cuda_uncompress_uint16(Gr, Bl, dest, dim, src, src_stride, scale); +} } // namespace kaldi From 5dbfe97fe4a45fd0b4d753974393a38cf39a63f8 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 00:44:42 -0500 Subject: [PATCH 093/184] [src] Various bug fixeS --- src/nnet3/nnet-analyze.cc | 41 ++++++++++++- src/nnet3/nnet-analyze.h | 1 + src/nnet3/nnet-common.cc | 30 +++++++--- src/nnet3/nnet-computation.cc | 19 +++++++ src/nnet3/nnet-computation.h | 4 ++ src/nnet3/nnet-compute.cc | 14 ++++- src/nnet3/nnet-compute.h | 2 +- src/nnet3/nnet-optimize-utils.cc | 98 +++++++++++++++++++++++++------- src/nnet3/nnet-optimize.cc | 39 +++++++------ src/nnet3/nnet-optimize.h | 3 - 10 files changed, 199 insertions(+), 52 deletions(-) diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index 551b50ff6ad..cd49e22b451 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -238,6 +238,23 @@ std::string ComputationVariables::DescribeVariable(int32 variable) const { return os.str(); } +NnetComputation::SubMatrixInfo ComputationVariables::VariableInfo( + int32 variable) const { + KALDI_ASSERT(variable >= 0 && variable < num_variables_); + int32 matrix_index = variable_to_matrix_[variable], + offset = variable - matrix_to_variable_index_[matrix_index], + num_column_variables = column_split_points_[matrix_index].size() - 1, + column_variable = offset % num_column_variables, + row_variable = offset / num_column_variables; + int32 row_offset = row_split_points_[matrix_index][row_variable], + num_rows = row_split_points_[matrix_index][row_variable+1] - row_offset, + col_offset = column_split_points_[matrix_index][column_variable], + num_cols = column_split_points_[matrix_index][column_variable+1] - + col_offset; + return NnetComputation::SubMatrixInfo(matrix_index, row_offset, num_rows, + col_offset, num_cols); +} + /// given a vector of pairs from computation.indexes_multi_indexes /// containing paris (submatrix-index, 
row-index), this function outputs @@ -622,6 +639,19 @@ void ComputationChecker::CheckComputationUndefined() const { const std::vector &accesses = a_.variable_accesses[v]; if (accesses.empty()) { if (config_.check_unused_variables) { + // Before we throw an error, we want to check that it isn't + // a case that can be produced by the ExtendMatrices() + // optimization, that is actually allowed. This is a case + // when a variable is the last few rows of a matrix, but + // not all columns of those last rows. + NnetComputation::SubMatrixInfo info = a_.variables.VariableInfo(v); + const NnetComputation::MatrixInfo &matrix_info = + computation_.matrices[info.matrix_index]; + if (info.row_offset > 0 && + info.num_rows + info.row_offset == matrix_info.num_rows && + !(info.col_offset == 0 && info.num_cols == matrix_info.num_cols)) { + continue; + } KALDI_ERR << "Variable " << v << " == " << a_.variables.DescribeVariable(v) << " is never used."; } @@ -728,9 +758,10 @@ void ComputationChecker::CheckComputationCompression() const { // alpha == 0.0 means we're only retaining the sign; we should // only do this if this is the output of a ReLU. // make sure there are only 2 commands after this: the uncompress - // command, a relu backprop command, and a deallocation command. + // command, and a relu backprop command. (Any deallocation + // command doesn't show up in the list of 'accesses'). KALDI_ASSERT(a > 0 && command.arg2 == kCompressedMatrixUint8 && - num_accesses == a + 4); + num_accesses == a + 3); // make sure the next access to that matrix, apart from the // uncompression command, is a ReLU propagation. 
int32 next_command_index = accesses.accesses[a+2].command_index; @@ -1004,14 +1035,18 @@ void ComputationChecker::CheckComputationIndexes() const { if (c.arg2 < static_cast(kCompressedMatrixInt8) || c.arg2 > static_cast(kCompressedMatrixUint16)) KALDI_ERR << "Invalid compressed-matrix type."; + if (c.arg3 != 0 && c.arg3 != 1) + KALDI_ERR << "Invalid 'truncate' option for compressing matrix."; if (c.alpha < 0.0 || c.alpha > 1000.0 || - (c.alpha == 0.0 && c.arg1 != kCompressedMatrixInt8)) + (c.alpha == 0.0 && c.arg2 != kCompressedMatrixUint8)) KALDI_ERR << "Invalid alpha in kCompressMatrix command."; + break; } case kUncompressMatrix: { if (c.arg1 < 1 || c.arg1 >= num_submatrices || !computation_.IsWholeMatrix(c.arg1)) KALDI_ERR << "submatrix index out of range or invalid"; + break; } case kAcceptInput: case kProvideOutput: { if (c.arg1 < 1 || c.arg1 >= num_submatrices || diff --git a/src/nnet3/nnet-analyze.h b/src/nnet3/nnet-analyze.h index 2966cf947e4..2e1a9a33c0b 100644 --- a/src/nnet3/nnet-analyze.h +++ b/src/nnet3/nnet-analyze.h @@ -160,6 +160,7 @@ class ComputationVariables { // zero indexing): something like "m1" or "m1(0:99,:)" or "m1(0:19,10:49)" std::string DescribeVariable(int32 variable) const; + NnetComputation::SubMatrixInfo VariableInfo(int32 variable) const; private: // sets up split_points_, matrix_to_variable_index_, and num_variables_. // called from constructor. diff --git a/src/nnet3/nnet-common.cc b/src/nnet3/nnet-common.cc index 75350d3d8f6..31ff9819dfa 100644 --- a/src/nnet3/nnet-common.cc +++ b/src/nnet3/nnet-common.cc @@ -440,6 +440,11 @@ void PrintIndexes(std::ostream &os, os << "[ ]"; return; } + // If the string is longer than 'max_string_length' characters, it will + // be summarized with '...' in the middle. + size_t max_string_length = 200; + std::ostringstream os_temp; + // range_starts will be the starts of ranges (with consecutive t values and // the same n value and zero x values) that we compactly print. 
we'll append // "end" to range_starts for convenience.n @@ -457,23 +462,32 @@ void PrintIndexes(std::ostream &os, } range_starts.push_back(cur_start); range_starts.push_back(end); - os << "["; + os_temp << "["; int32 num_ranges = range_starts.size() - 1; for (int32 r = 0; r < num_ranges; r++) { int32 range_start = range_starts[r], range_end = range_starts[r+1]; KALDI_ASSERT(range_end > range_start); - os << "(" << indexes[range_start].n << ","; + os_temp << "(" << indexes[range_start].n << ","; if (range_end == range_start + 1) - os << indexes[range_start].t; + os_temp << indexes[range_start].t; else - os << indexes[range_start].t << ":" << indexes[range_end - 1].t; + os_temp << indexes[range_start].t << ":" << indexes[range_end - 1].t; if (indexes[range_start].x != 0) - os << "," << indexes[range_start].x; - os << ")"; + os_temp << "," << indexes[range_start].x; + os_temp << ")"; if (r + 1 < num_ranges) - os << ", "; + os_temp << ", "; + } + os_temp << "]"; + + std::string str = os_temp.str(); + if (str.size() <= max_string_length) { + os << str; + } else { + size_t len = str.size(); + os << str.substr(0, max_string_length / 2) << " ... " + << str.substr(len - max_string_length / 2); } - os << "]"; } void PrintCindexes(std::ostream &ostream, diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index 405faa56ede..40f9f2146a0 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -621,6 +621,25 @@ static void PrintCommand(std::ostream &os, os << "])\n"; break; } + case kCompressMatrix: { + BaseFloat range = c.alpha; + std::string truncate = (c.arg3 != 0 ? 
"true" : "false"); + std::string compressed_matrix_type; + if (c.arg2 == kCompressedMatrixInt8) { compressed_matrix_type = "int8"; } + else if (c.arg2 == kCompressedMatrixUint8) { compressed_matrix_type = "uint8"; } + else if (c.arg2 == kCompressedMatrixInt16) { compressed_matrix_type = "int16"; } + else { + KALDI_ASSERT(c.arg2 == kCompressedMatrixInt16); + compressed_matrix_type = "uint16"; + } + os << "CompressMatrix(" << submatrix_strings[c.arg1] + << range << ", " << compressed_matrix_type << ", " + << truncate << ")\n"; + break; + } + case kUncompressMatrix: + os << "UncompressMatrix(" << submatrix_strings[c.arg1] << ")\n"; + break; case kAcceptInput: os << submatrix_strings[c.arg1] << " = user input [for node: '" << nnet.GetNodeName(c.arg2) << "']\n"; diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index 01c51e8e822..d077f9a69c6 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -237,6 +237,10 @@ struct ComputationRequest { compression type (it's converted from the enum CuCompressed MatrixType; 1=int8, 2=uint8, 3=int16, 4=uint16), and alpha determines the 'range' parameter (c.f. NewCuCompressedMatrix()). + arg3 will be converted to the 'truncate' argument to the + class CuCompressedMatrix; it should be false (0) if you know that + the input is limited to the allowed range, and true (1) if the + input may exceed that range (see docs for CuCompresedMatrix). - kUncompressMatrix: Uncompresses the matrix which is referred to by submatrix-index arg1 (it should previously have been compressed). 
- kAcceptInput: accepts a matrix of input from the user, which may be either diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index ad63043d851..f87b080fc43 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -391,8 +391,11 @@ void NnetComputer::ExecuteCommand() { int32 m = computation_.submatrices[c.arg1].matrix_index; KALDI_ASSERT(compressed_matrices_[m] == NULL && matrices_[m].NumRows() != 0); + BaseFloat range = c.alpha; + bool truncate = (c.arg3 != 0); compressed_matrices_[m] = NewCuCompressedMatrix( - static_cast(c.arg2), c.alpha); + static_cast(c.arg2), + range, truncate); compressed_matrices_[m]->CopyFromMat(matrices_[m]); matrices_[m].Resize(0, 0); } @@ -668,5 +671,14 @@ void NnetComputer::AcceptInputs(const Nnet &nnet, } } +NnetComputer::~NnetComputer() { + // Delete any pointers that are present in compressed_matrices_. Actually + // they should all already have been deallocated and set to NULL if the + // compuation was run to completion; we do this in case someone ran + // the forward propagation but not the backprop. + for (size_t i = 0; i < compressed_matrices_.size(); i++) + delete compressed_matrices_[i]; +} + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-compute.h b/src/nnet3/nnet-compute.h index 19af856bad8..9f1860c656d 100644 --- a/src/nnet3/nnet-compute.h +++ b/src/nnet3/nnet-compute.h @@ -125,6 +125,7 @@ class NnetComputer { CuMatrix *output); + ~NnetComputer(); private: void Init(); // called from constructors. @@ -239,7 +240,6 @@ class NnetComputer { // memos are not reusable. inline void *GetMemo(int32 memo_index); - private: NnetComputer &operator = (const NnetComputer &other); // Disallow. 
}; diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 75521a43658..027781d64ad 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -1042,9 +1042,12 @@ class MatrixExtender { // This function modifies the computation to fix certain problems // that might have been introduced by Extend()... allocation, deallocation, - // void FixComputation(); + // This function modifies the computation to fix the debug info; if needed, + // it's called from FixComputation(). + void FixDebugInfo(); + // don't extend a destination matrix if it wasn't already // at least 'min_proportion' (80%) big enough to store the source. BaseFloat min_proportion_; @@ -1204,14 +1207,64 @@ void MatrixExtender::FixComputation() { m = computation_->submatrices[s].matrix_index, new_s = whole_submatrices[m]; if (new_s != s) { - KALDI_ASSERT(orig_num_rows_[m] != computation_->matrices[m].num_rows); + KALDI_ASSERT( + computation_->submatrices[s] == computation_->submatrices[new_s] || + orig_num_rows_[m] != computation_->matrices[m].num_rows); + command.arg1 = new_s; + } + } + if (command.command_type == kSetConst && command.alpha == 0.0) { + int32 s = command.arg1, + m = computation_->submatrices[s].matrix_index, + new_s = whole_submatrices[m]; + if (new_s != s) { + { + const NnetComputation::SubMatrixInfo &info = computation_->submatrices[ + command.arg1]; + const NnetComputation::MatrixInfo &mat_info = computation_->matrices[ + info.matrix_index]; + // If this command wasn't zeroing the the entirety of a matrix, + // (before we extended the matrix), we don't need to extend it. + if (!(info.row_offset == 0 && info.col_offset == 0 && + info.num_cols == mat_info.num_cols && + info.num_rows == orig_num_rows_[info.matrix_index])) + continue; + // I know doing this via 'continue' is odd, but it's done this way to + // avoid invalid iterators still being in scope; I think some runtimes + // check for it. 
+ } command.arg1 = new_s; } } } + if (!computation_->matrix_debug_info.empty()) + FixDebugInfo(); RenumberComputation(computation_); } +void MatrixExtender::FixDebugInfo() { + int32 num_matrices = computation_->matrices.size(); + // matrix zero is not a 'real' matrix. + for (int32 m = 1; m < num_matrices; m++) { + NnetComputation::MatrixDebugInfo &debug_info = + computation_->matrix_debug_info[m]; + int32 new_num_rows = computation_->matrices[m].num_rows, + old_num_rows = debug_info.cindexes.size(); + if (new_num_rows != old_num_rows) { + debug_info.cindexes.resize(new_num_rows); + int32 num_extra_rows = new_num_rows - old_num_rows; + // the following should be true because min_proportion_ > 0.5. + KALDI_ASSERT(num_extra_rows <= old_num_rows); + for (int32 r = old_num_rows; r < new_num_rows; r++) { + Cindex cindex = debug_info.cindexes[r - num_extra_rows]; + // set the 't' value to kNoTime which indicates that it's not a 'real' + // time step, and may avoid errors in checking code. + cindex.second.t = kNoTime; + debug_info.cindexes[r] = cindex; + } + } + } +} void ExtendMatrices(NnetComputation *computation) { MatrixExtender ext(computation); @@ -3155,6 +3208,7 @@ void ComputationExpander::ComputeCommands() { case kAddRowRanges: ExpandRowRangesCommand(c, &c_out); break; + case kCompressMatrix: case kUncompressMatrix: case kAcceptInput: case kProvideOutput: case kNoOperation: case kNoOperationPermanent: case kNoOperationMarker: case kNoOperationLabel: case kGotoLabel: @@ -4365,14 +4419,18 @@ class MemoryCompressionOptimizer { // sign (-1, 0 or 1) of the input, and decompresses it to -1, 0 or 1; this // is useful for ReLUs. BaseFloat range; - + // this is provided to the initializer of CuCompressedMatrix; it should + // be true if the values being compressed are potentially outside of + // the representable range. 
+ bool truncate; MatrixCompressInfo(int32 m, int32 forward_command_index, int32 backward_command_index, CuCompressedMatrixType compression_type, - BaseFloat range): + BaseFloat range, bool truncate): m(m), compression_command_index(forward_command_index), uncompression_command_index(backward_command_index), - compression_type(compression_type), range(range) { } + compression_type(compression_type), range(range), + truncate(truncate) { } }; std::vector compress_info_; @@ -4406,7 +4464,8 @@ void MemoryCompressionOptimizer::ModifyComputation() { std::pair p1( info.compression_command_index + 1, NnetComputation::Command(info.range, kCompressMatrix, - s, static_cast(info.compression_type))); + s, static_cast(info.compression_type), + info.truncate ? 1 : 0)); pairs_to_insert.push_back(p1); std::pair p2( info.uncompression_command_index, @@ -4443,6 +4502,11 @@ void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { std::vector::const_iterator iter = std::lower_bound(accesses.begin(), accesses.end(), middle_access); + + if (m == 84) { + KALDI_LOG << "m == 84"; //TEMP + } + // At this point, 'iter' points to the first access in 'accesses' // whose command index is >= 'middle_command_' (which separates the forward // and backward passes), or accesses.end() if this matrix was not @@ -4466,18 +4530,10 @@ void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { // 'backward_access_is_last_access' is going to be set to true if // 'backward_access' is the last command to access the matrix (apart from - // deallocation commands). - bool backward_access_is_last_access = false; - if (accesses.end() - iter <= 2) { - // if there is at most 1 command after 'backward_access' that accesses this - // matrix... 
- const Access &next_access = iter[1]; - NnetComputation::Command &next_command = - computation_->commands[next_access.command_index]; - if (next_command.command_type == kDeallocMatrix || - next_command.command_type == kSwapMatrix) - backward_access_is_last_access = true; - } + // deallocation or matrix-swap commands, which don't show up in the list of + // accesses). + bool backward_access_is_last_access = (accesses.end() == iter + 1); + int32 backward_command_index = backward_access.command_index, forward_command_index = forward_access.command_index; NnetComputation::Command @@ -4495,7 +4551,8 @@ void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { compress_info_.push_back( MatrixCompressInfo(m, forward_command_index, backward_command_index, - kCompressedMatrixUint8, 0.0)); + kCompressedMatrixUint8, 0.0, + true)); return; } } @@ -4510,7 +4567,8 @@ void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { compress_info_.push_back( MatrixCompressInfo(m, forward_command_index, backward_command_index, - kCompressedMatrixInt16, 10.0)); + kCompressedMatrixInt16, 10.0, + true)); return; } diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 872624eaa7e..a67f0090ef7 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -478,7 +478,7 @@ void Optimize(const NnetOptimizeOptions &config, const Nnet &nnet, int32 max_output_time_in_request, NnetComputation *computation) { - if (GetVerboseLevel() >= 3) { + if (GetVerboseLevel() >= 1) { // TEMP, should be 3 CheckComputation(nnet, *computation, true); KALDI_LOG << "Before optimization, max memory use (bytes) = " << GetMaxMemoryUse(*computation); @@ -515,6 +515,20 @@ void Optimize(const NnetOptimizeOptions &config, CheckComputation(nnet, *computation, true); } + + if (config.optimize && (config.snip_row_ops || config.optimize_row_ops)) { + bool must_renumber = false; + if (config.snip_row_ops && SnipRowOps(computation)) + must_renumber = true; + if (config.optimize_row_ops && 
ReplaceRowWithMatrixOps(computation)) + must_renumber = true; + if (must_renumber) { + RenumberComputation(computation); + if (GetVerboseLevel() >= 3) + CheckComputation(nnet, *computation, false); + } + } + if (config.optimize && config.extend_matrices && !config.optimize_looped_computation) { ExtendMatrices(computation); @@ -531,20 +545,6 @@ void Optimize(const NnetOptimizeOptions &config, CheckComputation(nnet, *computation, false); } - if (config.optimize && (config.snip_row_ops || config.optimize_row_ops)) { - bool must_renumber = false; - if (config.snip_row_ops && SnipRowOps(computation)) - must_renumber = true; - if (config.optimize_row_ops && ReplaceRowWithMatrixOps(computation)) - must_renumber = true; - if (must_renumber) { - RenumberComputation(computation); - if (GetVerboseLevel() >= 3) - CheckComputation(nnet, *computation, false); - } - } - - if (config.optimize && config.initialize_undefined) { RemoveUnnecessaryZeroing(nnet, computation); if (GetVerboseLevel() >= 3) @@ -594,13 +594,20 @@ void Optimize(const NnetOptimizeOptions &config, computation); if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); + + { // TEMP + std::ostringstream os; + computation->Print(os, nnet); + KALDI_LOG << "Compuation after adding memory compression is: " << os.str(); + } } - if (GetVerboseLevel() >= 3) { + if (GetVerboseLevel() >= 1) { // TEMP, should be 3 CheckComputation(nnet, *computation, false); KALDI_LOG << "After optimization, max memory use (bytes) = " << GetMaxMemoryUse(*computation); } + } // ComputationRequests are distinguished by the names and indexes diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index ba8efce0fe3..cf308dd3b00 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -131,9 +131,6 @@ struct NnetOptimizeOptions { opts->Register("snip-row-ops", &snip_row_ops, "Set this to false to " "disable an optimization that reduces the size of certain " "per-row operations"); - 
opts->Register("snip-row-ops", &snip_row_ops, "Set this to false to " - "disable an optimization that reduces the size of certain " - "per-row operations"); opts->Register("memory-compression-level", &memory_compression_level, "This is only relevant to training, not decoding. Set this " "to 0,1,2,3; higher levels are more aggressive at reducing " From 0f14373f479099f894062ba714bf607479170145 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 14:15:22 -0500 Subject: [PATCH 094/184] [src] Work around problem related to ungetc failures on ifstream (#2194) --- src/base/io-funcs.cc | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/base/io-funcs.cc b/src/base/io-funcs.cc index 8b8662b6401..90988faf3ea 100644 --- a/src/base/io-funcs.cc +++ b/src/base/io-funcs.cc @@ -178,8 +178,14 @@ int PeekToken(std::istream &is, bool binary) { } int ans = is.peek(); if (read_bracket) { - if (!is.unget()) + if (!is.unget()) { KALDI_WARN << "Error ungetting '<' in PeekToken"; + // Clear the bad bit. It seems to be possible for this code to be + // reached, and the C++ standard is very vague on whether even a single + // call to unget() should succeed; see + // http://www.cplusplus.com/reference/istream/istream/unget/ + is.clear(); + } } return ans; } @@ -197,7 +203,12 @@ void ExpectToken(std::istream &is, bool binary, const char *token) { KALDI_ERR << "Failed to read token [started at file position " << pos_at_start << "], expected " << token; } - if (strcmp(str.c_str(), token) != 0) { + // The second half of the '&&' expression below is so that if we're expecting + // "", we will accept "Foo>" instead. This is so that the model-reading + // code will tolerate errors in PeekToken where is.unget() failed; search for + // is.clear() in PeekToken() for an explanation. 
+ if (strcmp(str.c_str(), token) != 0 && + !(token[0] == '<' && strcmp(str.c_str(), token + 1) == 0)) { KALDI_ERR << "Expected token \"" << token << "\", got instead \"" << str <<"\"."; } From 6b62e0a0459fd6c1708dab4f193b9f6fa404bbc9 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 16:43:52 -0500 Subject: [PATCH 095/184] [src] Small fix to component reading, workaround for ungetc() issue. --- src/nnet3/nnet-simple-component.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 01adb222372..b3cf89ae6b4 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -2725,8 +2725,8 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { } std::string token; ReadToken(is, binary, &token); - if (token != "" && - token != "") + // the following has to handle a couple variants of + if (token.find("NaturalGradientAffineComponent>") == std::string::npos) KALDI_ERR << "Expected or " << ", got " << token; } From 30e9a90d30d2007b30698a6351c9a36df1acf2ad Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 16:43:52 -0500 Subject: [PATCH 096/184] [src] Small fix to component reading, workaround for ungetc() issue. 
--- src/nnet3/nnet-simple-component.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 91906ac1ddf..c6d2c1f7952 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -2854,8 +2854,8 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { } std::string token; ReadToken(is, binary, &token); - if (token != "" && - token != "") + // the following has to handle a couple variants of + if (token.find("NaturalGradientAffineComponent>") == std::string::npos) KALDI_ERR << "Expected or " << ", got " << token; SetNaturalGradientConfigs(); From b9fc15171319b5c0f0d0cfb16b4201524523adc9 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 19:29:52 -0500 Subject: [PATCH 097/184] [src] Some small cosmetic changes --- src/chain/chain-denominator.cc | 2 +- src/chain/chain-denominator.h | 3 +-- src/nnet3/nnet-utils.cc | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index 2b27d4b9176..620ea873eb7 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -260,7 +260,7 @@ bool DenominatorComputation::Backward( BetaGeneralFrameDebug(t); Beta(t); if (t % kMaxDerivTimeSteps == 0) { - // commit the derivative stored in exp_nnet_output_transposed_ by adding + // commit the derivative stored in nnet_output_deriv_transposed_ by adding // its transpose to the appropriate sub-matrix of 'nnet_output_deriv'. 
int32 chunk_frames = std::min(static_cast(kMaxDerivTimeSteps), frames_per_sequence_ - t), diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h index a4a417c8a5d..f44588e434f 100644 --- a/src/chain/chain-denominator.h +++ b/src/chain/chain-denominator.h @@ -51,7 +51,7 @@ namespace chain { All this is done in parallel over multiple sequences, but the computations are independent over the separate sequences, so we won't introduce any notation - or index for the sequence; we'll just explain it for one sequences. + or index for the sequence; we'll just explain it for one sequence. Suppose we have I hmm-states, numbered i = 0 ... I-1 (we'll use i and j for hmm-state indexes). Let foll(i) give a list of arcs leaving state i, and @@ -313,4 +313,3 @@ class DenominatorComputation { } // namespace kaldi #endif // KALDI_CHAIN_CHAIN_DENOMINATOR_H_ - diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index cc5762474d6..59885cf70b2 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -888,7 +888,7 @@ void ConstrainOrthonormalInternal(BaseFloat scale, CuMatrixBase *M) { if (GetVerboseLevel() >= 1) { BaseFloat error = P.FrobeniusNorm(); - KALDI_VLOG(1) << "Error in orthogonality is " << error; + KALDI_VLOG(2) << "Error in orthogonality is " << error; } // At this point, the matrix P contains what, in the math, would be Q = From 46cdd54f4b9fb6124acefb92258f9d3ce57d82de Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 19:32:57 -0500 Subject: [PATCH 098/184] [src] Various fixes w.r.t memory compression; cosmetic fixes too. 
--- src/nnet3/nnet-analyze.cc | 28 +++++++++++++++------------- src/nnet3/nnet-analyze.h | 2 +- src/nnet3/nnet-computation.cc | 26 +++++++++++++++++++------- src/nnet3/nnet-computation.h | 4 ++-- src/nnet3/nnet-compute.cc | 2 +- src/nnet3/nnet-compute.h | 2 +- src/nnet3/nnet-optimize-utils.cc | 9 ++------- src/nnet3/nnet-optimize-utils.h | 2 +- src/nnet3/nnet-optimize.cc | 10 ++-------- 9 files changed, 44 insertions(+), 41 deletions(-) diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index cd49e22b451..cf48d3d86c6 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -388,7 +388,7 @@ void ComputeCommandAttributes( vars.RecordAccessForSubmatrix(c.arg1, kReadWriteAccess, &attr); break; } - case kUncompressMatrix: { + case kDecompressMatrix: { vars.RecordAccessForSubmatrix(c.arg1, kWriteAccess, &attr); break; } @@ -656,7 +656,11 @@ void ComputationChecker::CheckComputationUndefined() const { << a_.variables.DescribeVariable(v) << " is never used."; } } else { - if (accesses[0].access_type != kWriteAccess) + // It's OK if part of a matrix is compressed, that is undefined; + // likely that part won't be referred to when we uncompress. + if (accesses[0].access_type != kWriteAccess && + !(computation_.commands[accesses[0].command_index].command_type == + kCompressMatrix)) KALDI_ERR << "Variable " << v << " == " << a_.variables.DescribeVariable(v) << " is read before it is written to"; @@ -738,7 +742,7 @@ void ComputationChecker::CheckComputationCompression() const { int32 command_index = access.command_index; const NnetComputation::Command &command = computation_.commands[command_index]; - if (command.command_type == kUncompressMatrix) { + if (command.command_type == kDecompressMatrix) { // check that the previous access to this matrix was a compression // command. KALDI_ASSERT( @@ -751,7 +755,7 @@ void ComputationChecker::CheckComputationCompression() const { // command. 
int32 next_command_index = accesses.accesses[a+1].command_index; KALDI_ASSERT(computation_.commands[next_command_index].command_type == - kUncompressMatrix && + kDecompressMatrix && command_index < middle_command && next_command_index > middle_command); if (command.alpha == 0.0) { @@ -1042,7 +1046,7 @@ void ComputationChecker::CheckComputationIndexes() const { KALDI_ERR << "Invalid alpha in kCompressMatrix command."; break; } - case kUncompressMatrix: { + case kDecompressMatrix: { if (c.arg1 < 1 || c.arg1 >= num_submatrices || !computation_.IsWholeMatrix(c.arg1)) KALDI_ERR << "submatrix index out of range or invalid"; @@ -1445,13 +1449,11 @@ int64 GetMaxMemoryUse(const NnetComputation &computation) { this_num_bytes = static_cast(sizeof(BaseFloat)) * submat_info.num_rows * submat_info.num_cols; - if (c.arg2 >= static_cast(kCompressedMatrixInt8) && - c.arg2 <= static_cast(kCompressedMatrixUint16)) { - this_compressed_num_bytes = - ((c.arg2 == static_cast(kCompressedMatrixInt8) || - c.arg2 == static_cast(kCompressedMatrixUint8)) ? - 1 : 2) * submat_info.num_rows * submat_info.num_cols; - } + this_compressed_num_bytes = + ((c.arg2 == static_cast(kCompressedMatrixInt8) || + c.arg2 == static_cast(kCompressedMatrixUint8)) ? + 1 : 2) * static_cast(submat_info.num_rows) * + submat_info.num_cols; } switch (c.command_type) { case kAllocMatrix: @@ -1464,7 +1466,7 @@ int64 GetMaxMemoryUse(const NnetComputation &computation) { case kCompressMatrix: cur_memory_use += this_compressed_num_bytes - this_num_bytes; break; - case kUncompressMatrix: + case kDecompressMatrix: cur_memory_use += this_num_bytes - this_compressed_num_bytes; break; default: diff --git a/src/nnet3/nnet-analyze.h b/src/nnet3/nnet-analyze.h index 2e1a9a33c0b..77466039756 100644 --- a/src/nnet3/nnet-analyze.h +++ b/src/nnet3/nnet-analyze.h @@ -423,7 +423,7 @@ class ComputationChecker { void CheckComputationRewrite() const; // Check matrix accesses make sense. 
void CheckComputationMatrixAccesses() const; - // Some checks related to the kCompressMatrix and kUncompressMatrix commands. + // Some checks related to the kCompressMatrix and kDecompressMatrix commands. void CheckComputationCompression() const; // Check debug_info has the correct size, if used. void CheckComputationDebugInfo() const; diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index 40f9f2146a0..a9a21bb3f24 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -284,8 +284,8 @@ void NnetComputation::Command::Read(std::istream &is, bool binary) { command_type = kAddRowRanges; } else if (command_type_str == "kCompressMatrix") { command_type = kCompressMatrix; - } else if (command_type_str == "kUncompressMatrix") { - command_type = kUncompressMatrix; + } else if (command_type_str == "kDecompressMatrix") { + command_type = kDecompressMatrix; } else if (command_type_str == "kAcceptInput") { command_type = kAcceptInput; } else if (command_type_str == "kProvideOutput") { @@ -382,8 +382,8 @@ void NnetComputation::Command::Write(std::ostream &os, bool binary) const { case kCompressMatrix: os << "kCompressMatrix\n"; break; - case kUncompressMatrix: - os << "kUncompressMatrix\n"; + case kDecompressMatrix: + os << "kDecompressMatrix\n"; break; case kAcceptInput: os << "kAcceptInput\n"; @@ -510,13 +510,17 @@ static void GetIndexesMultiStrings( // writes to "os" the statement for this command. -static void PrintCommand(std::ostream &os, +static void PrintCommand(std::ostream &os_out, const Nnet &nnet, const NnetComputation &computation, int32 command_index, const std::vector &submatrix_strings, const std::vector &indexes_strings, const std::vector &indexes_multi_strings) { + // If the string is longer than 'max_string_length' characters, it will + // be summarized with '...' in the middle. 
+ size_t max_string_length = 200; + std::ostringstream os; KALDI_ASSERT(command_index < computation.commands.size()); os << "c" << command_index << ": "; const NnetComputation::Command &c = computation.commands[command_index]; @@ -637,8 +641,8 @@ static void PrintCommand(std::ostream &os, << truncate << ")\n"; break; } - case kUncompressMatrix: - os << "UncompressMatrix(" << submatrix_strings[c.arg1] << ")\n"; + case kDecompressMatrix: + os << "DecompressMatrix(" << submatrix_strings[c.arg1] << ")\n"; break; case kAcceptInput: os << submatrix_strings[c.arg1] << " = user input [for node: '" @@ -666,6 +670,14 @@ static void PrintCommand(std::ostream &os, default: KALDI_ERR << "Un-handled command type."; } + std::string str = os.str(); + if (str.size() <= max_string_length) { + os_out << str; + } else { + size_t len = str.size(); + os_out << str.substr(0, max_string_length / 2) << " ... " + << str.substr(len - max_string_length / 2); + } } diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index d077f9a69c6..0c6c690684a 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -241,7 +241,7 @@ struct ComputationRequest { class CuCompressedMatrix; it should be false (0) if you know that the input is limited to the allowed range, and true (1) if the input may exceed that range (see docs for CuCompresedMatrix). - - kUncompressMatrix: Uncompresses the matrix which is referred to + - kDecompressMatrix: Decompresses the matrix which is referred to by submatrix-index arg1 (it should previously have been compressed). - kAcceptInput: accepts a matrix of input from the user, which may be either features, or derivatives w.r.t. the output. 
arg1 is the submatrix index of @@ -274,7 +274,7 @@ enum CommandType { kPropagate, kBackprop, kBackpropNoModelUpdate, kMatrixCopy, kMatrixAdd, kCopyRows, kAddRows, kCopyRowsMulti, kCopyToRowsMulti, kAddRowsMulti, kAddToRowsMulti, - kAddRowRanges, kCompressMatrix, kUncompressMatrix, + kAddRowRanges, kCompressMatrix, kDecompressMatrix, kAcceptInput, kProvideOutput, kNoOperation, kNoOperationPermanent, kNoOperationMarker, kNoOperationLabel, kGotoLabel }; diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index f87b080fc43..835b7fcfd88 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -401,7 +401,7 @@ void NnetComputer::ExecuteCommand() { } #endif } - case kUncompressMatrix: { + case kDecompressMatrix: { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { int32 m = computation_.submatrices[c.arg1].matrix_index; diff --git a/src/nnet3/nnet-compute.h b/src/nnet3/nnet-compute.h index 9f1860c656d..333ed3168b9 100644 --- a/src/nnet3/nnet-compute.h +++ b/src/nnet3/nnet-compute.h @@ -164,7 +164,7 @@ class NnetComputer { // NULL). std::vector memos_; - // This is only used when commands kCompressMatrix and kUncompressMatrix are + // This is only used when commands kCompressMatrix and kDecompressMatrix are // invoked. It will be (the first time we compress a matrix) resized to be // the same size as 'matrices_' (i.e., indexed by matrix index). 
When we // compress a matrix m we set compressed_matrices_[m] to a non-NULL value and diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 027781d64ad..19ca31cf955 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -3208,7 +3208,7 @@ void ComputationExpander::ComputeCommands() { case kAddRowRanges: ExpandRowRangesCommand(c, &c_out); break; - case kCompressMatrix: case kUncompressMatrix: + case kCompressMatrix: case kDecompressMatrix: case kAcceptInput: case kProvideOutput: case kNoOperation: case kNoOperationPermanent: case kNoOperationMarker: case kNoOperationLabel: case kGotoLabel: @@ -4469,7 +4469,7 @@ void MemoryCompressionOptimizer::ModifyComputation() { pairs_to_insert.push_back(p1); std::pair p2( info.uncompression_command_index, - NnetComputation::Command(1.0, kUncompressMatrix, s)); + NnetComputation::Command(1.0, kDecompressMatrix, s)); pairs_to_insert.push_back(p2); } InsertCommands(&pairs_to_insert, @@ -4502,11 +4502,6 @@ void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { std::vector::const_iterator iter = std::lower_bound(accesses.begin(), accesses.end(), middle_access); - - if (m == 84) { - KALDI_LOG << "m == 84"; //TEMP - } - // At this point, 'iter' points to the first access in 'accesses' // whose command index is >= 'middle_command_' (which separates the forward // and backward passes), or accesses.end() if this matrix was not diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 93f3cdb128f..703f43af095 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -552,7 +552,7 @@ void InsertCommands( NnetComputation *computation); /// Performs optimization to reduce memory usage where possible, -/// making use of the kCompressMatrix and kUncompressMatrix commands. +/// making use of the kCompressMatrix and kDecompressMatrix commands. 
/// Should only be done after most other optimizations, because some /// optimizations (such as variable-merging) would not work correctly /// after doing this optimization. This does nothing for looped diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index a67f0090ef7..d614afce7d0 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -478,7 +478,7 @@ void Optimize(const NnetOptimizeOptions &config, const Nnet &nnet, int32 max_output_time_in_request, NnetComputation *computation) { - if (GetVerboseLevel() >= 1) { // TEMP, should be 3 + if (GetVerboseLevel() >= 3) { CheckComputation(nnet, *computation, true); KALDI_LOG << "Before optimization, max memory use (bytes) = " << GetMaxMemoryUse(*computation); @@ -594,15 +594,9 @@ void Optimize(const NnetOptimizeOptions &config, computation); if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); - - { // TEMP - std::ostringstream os; - computation->Print(os, nnet); - KALDI_LOG << "Compuation after adding memory compression is: " << os.str(); - } } - if (GetVerboseLevel() >= 1) { // TEMP, should be 3 + if (GetVerboseLevel() >= 3) { CheckComputation(nnet, *computation, false); KALDI_LOG << "After optimization, max memory use (bytes) = " << GetMaxMemoryUse(*computation); From 8638ec9d7b2bf503a2fbfde4eb505354bd5a52b8 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 20:30:36 -0500 Subject: [PATCH 099/184] [src] Change for memory efficiency to chain training; small cosmetic fixes. 
--- src/chain/chain-supervision-test.cc | 2 +- src/chain/chain-training.cc | 54 ++++++++++++++++------------- src/chain/chain-training.h | 18 +++++----- src/nnet3/nnet-chain-training.cc | 3 -- src/nnet3/nnet-compile-looped.cc | 1 - 5 files changed, 41 insertions(+), 37 deletions(-) diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index 7bf3c17854a..d14c80cd84f 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -607,8 +607,8 @@ void TestRanges() { int main() { using namespace kaldi; SetVerboseLevel(1); - int32 loop = 0; #if HAVE_CUDA == 1 + int32 loop = 0; for (loop = 0; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 53de69a0e07..677e8f8d3dc 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -33,38 +33,44 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrixBase *xent_output_deriv) { - BaseFloat num_logprob_weighted; - if (nnet_output_deriv) + CuMatrix *xent_output_deriv) { + BaseFloat num_logprob_weighted, den_logprob_weighted; + bool ok = true; + if (nnet_output_deriv != NULL) nnet_output_deriv->SetZero(); + + { // Doing the denominator first helps to reduce the maximum + // memory use, as we can set 'xent_deriv' to nonempty after + // we've freed the memory in this object. 
+ DenominatorComputation denominator(opts, den_graph, + supervision.num_sequences, + nnet_output); + + den_logprob_weighted = supervision.weight * denominator.Forward(); + if (nnet_output_deriv) + ok = denominator.Backward(-supervision.weight, + nnet_output_deriv); + } + + if (xent_output_deriv != NULL) + xent_output_deriv->Resize(nnet_output.NumRows(), nnet_output.NumCols()); + + { NumeratorComputation numerator(supervision, nnet_output); // note: supervision.weight is included as a factor in the derivative from - // the numerator object, and the logprob too. + // the numerator object, as well as the returned logprob. num_logprob_weighted = numerator.Forward(); - if (nnet_output_deriv) { - numerator.Backward(nnet_output_deriv); - if (xent_output_deriv) - xent_output_deriv->CopyFromMat(*nnet_output_deriv); - } else if (xent_output_deriv) { - // this branch will be taken if xent_output_deriv but not - // nnet_output_deriv is set- which could happen if you want to compute the - // cross-entropy objective but not the derivatives. - xent_output_deriv->SetZero(); + + if (xent_output_deriv) { numerator.Backward(xent_output_deriv); + nnet_output_deriv->AddMat(1.0, *xent_output_deriv); + } else if (nnet_output_deriv) { + numerator.Backward(nnet_output_deriv); } } - DenominatorComputation denominator(opts, den_graph, - supervision.num_sequences, - nnet_output); - - BaseFloat den_logprob = denominator.Forward(); - bool ok = true; - if (nnet_output_deriv) - ok = denominator.Backward(-supervision.weight, - nnet_output_deriv); - *objf = num_logprob_weighted - supervision.weight * den_logprob; + *objf = num_logprob_weighted - den_logprob_weighted; *weight = supervision.weight * supervision.num_sequences * supervision.frames_per_sequence; if (!((*objf) - (*objf) == 0) || !ok) { @@ -86,7 +92,7 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, // for different frames of the sequences. 
As expected, they are // smaller towards the edges of the sequences (due to the penalization // of 'incorrect' pdf-ids. - if (GetVerboseLevel() >= 1 && nnet_output_deriv != NULL) { + if (GetVerboseLevel() >= 1 && nnet_output_deriv != NULL && RandInt(0, 10) == 0) { int32 tot_frames = nnet_output_deriv->NumRows(), frames_per_sequence = supervision.frames_per_sequence, num_sequences = supervision.num_sequences; diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index e6143d10846..d6535902625 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -63,7 +63,7 @@ struct ChainTrainingOptions { ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), xent_regularize(0.0) { } - + void Register(OptionsItf *opts) { opts->Register("l2-regularize", &l2_regularize, "l2 regularization " "constant for 'chain' training, applied to the output " @@ -107,10 +107,13 @@ struct ChainTrainingOptions { You don't have to zero this before passing to this function, we zero it internally. @param [out] xent_output_deriv If non-NULL, then the numerator part of the derivative - (which equals a posterior from the numerator forward-backward, - scaled by the supervision weight) is written to here. This will - be used in the cross-entropy regularization code. This value - is also used in computing the cross-entropy objective value. + (which equals a posterior from the numerator + forward-backward, scaled by the supervision weight) + is written to here (this function will set it to the + correct size first; doing it this way reduces the + peak memory use). xent_output_deriv will be used in + the cross-entropy regularization code; it is also + used in computing the cross-entropy objective value. 
*/ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, @@ -120,12 +123,11 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrixBase *xent_output_deriv = NULL); - + CuMatrix *xent_output_deriv = NULL); + } // namespace chain } // namespace kaldi #endif // KALDI_CHAIN_CHAIN_TRAINING_H_ - diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index f2eaff7e429..16d955fb2f7 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -220,9 +220,6 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, bool use_xent = (opts_.chain_config.xent_regularize != 0.0); std::string xent_name = sup.name + "-xent"; // typically "output-xent". CuMatrix xent_deriv; - if (use_xent) - xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), - kUndefined); BaseFloat tot_objf, tot_l2_term, tot_weight; diff --git a/src/nnet3/nnet-compile-looped.cc b/src/nnet3/nnet-compile-looped.cc index fa8a2322e5a..1a5ceabab0e 100644 --- a/src/nnet3/nnet-compile-looped.cc +++ b/src/nnet3/nnet-compile-looped.cc @@ -357,7 +357,6 @@ void CreateLoopedComputationRequestSimple(const Nnet &nnet, ComputationRequest *request1, ComputationRequest *request2, ComputationRequest *request3) { - bool has_ivector = (nnet.InputDim("ivector") > 0); int32 left_context, right_context; ComputeSimpleNnetContext(nnet, &left_context, &right_context); From 32b9f7b86b548587daccdabb2f158c37b46c65bd Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 22:16:26 -0500 Subject: [PATCH 100/184] [src] Fix to nnet-compute RE compression code. 
--- src/nnet3/nnet-compute.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 835b7fcfd88..19eecdda72b 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -382,7 +382,7 @@ void NnetComputer::ExecuteCommand() { } break; } - case kCompressMatrix: { + case kCompressMatrix: // This does nothing if CUDA is not in use. #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -399,9 +399,9 @@ void NnetComputer::ExecuteCommand() { compressed_matrices_[m]->CopyFromMat(matrices_[m]); matrices_[m].Resize(0, 0); } + break; #endif - } - case kDecompressMatrix: { + case kDecompressMatrix: #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { int32 m = computation_.submatrices[c.arg1].matrix_index; @@ -417,7 +417,7 @@ void NnetComputer::ExecuteCommand() { compressed_matrices_[m] = NULL; } #endif - } + break; case kNoOperation: case kNoOperationPermanent: case kNoOperationMarker: case kNoOperationLabel: break; From c89812a7f3078654b4374269e7e21b042651d1bc Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 23:59:58 -0500 Subject: [PATCH 101/184] [scripts] Add catch-all option 'trainer.add-option' --- .../libs/nnet3/train/chain_objf/acoustic_model.py | 9 +++++---- egs/wsj/s5/steps/libs/nnet3/train/common.py | 5 +++++ .../steps/libs/nnet3/train/frame_level_objf/common.py | 11 ++++++----- egs/wsj/s5/steps/nnet3/chain/train.py | 1 + egs/wsj/s5/steps/nnet3/train_dnn.py | 7 ++++--- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 1 + egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 1 + egs/wsj/s5/steps/nnet3/train_rnn.py | 1 + 8 files changed, 24 insertions(+), 12 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index c63901367d6..3df2720b2c0 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ 
b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -128,7 +128,7 @@ def train_new_models(dir, iter, srand, num_jobs, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch_str, - frame_subsampling_factor, run_opts, + frame_subsampling_factor, run_opts, train_opts, backstitch_training_scale=0.0, backstitch_training_interval=1): """ Called from train_one_iteration(), this method trains new models @@ -184,7 +184,7 @@ def train_new_models(dir, iter, srand, num_jobs, --max-param-change={max_param_change} \ --backstitch-training-scale={backstitch_training_scale} \ --backstitch-training-interval={backstitch_training_interval} \ - --l2-regularize-factor={l2_regularize_factor} \ + --l2-regularize-factor={l2_regularize_factor} {train_opts} \ --srand={srand} \ "{raw_model}" {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs \ @@ -201,6 +201,7 @@ def train_new_models(dir, iter, srand, num_jobs, deriv_time_opts=" ".join(deriv_time_opts), app_deriv_wts=apply_deriv_weights, fr_shft=frame_shift, l2=l2_regularize, + train_opts=train_opts, xent_reg=xent_regularize, leaky=leaky_hmm_coefficient, cache_io_opts=cache_io_opts, parallel_train_opts=run_opts.parallel_train_opts, @@ -233,7 +234,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, frame_subsampling_factor, - run_opts, dropout_edit_string="", + run_opts, dropout_edit_string="", train_opts="", backstitch_training_scale=0.0, backstitch_training_interval=1): """ Called from steps/nnet3/chain/train.py for one iteration for neural network training with LF-MMI objective @@ -306,7 +307,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, shuffle_buffer_size=shuffle_buffer_size, num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str, frame_subsampling_factor=frame_subsampling_factor, - run_opts=run_opts, + run_opts=run_opts, train_opts=train_opts, # linearly increase 
backstitch_training_scale during the # first few iterations (hard-coded as 15) backstitch_training_scale=(backstitch_training_scale * diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 6c194a2c0a1..6f3e8877ae8 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -903,6 +903,11 @@ def __init__(self, lstm*=0,0.2,0'. More general should precede less general patterns, as they are applied sequentially.""") + self.parser.add_argument("--trainer.add-option", type=str, + dest='train_opts', action='append', + help="""You can use this to add arbitrary options that + will be passed through to the core training code (nnet3-train + or nnet3-chain-train)""") self.parser.add_argument("--trainer.optimization.backstitch-training-scale", type=float, dest='backstitch_training_scale', default=0.0, help="""scale of parameters changes diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 72b776351f6..9dd12e63f52 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -33,7 +33,7 @@ def train_new_models(dir, iter, srand, num_jobs, image_augmentation_opts, run_opts, frames_per_eg=-1, min_deriv_time=None, max_deriv_time_relative=None, - use_multitask_egs=False, + use_multitask_egs=False, train_opts="", backstitch_training_scale=0.0, backstitch_training_interval=1): """ Called from train_one_iteration(), this model does one iteration of training with 'num_jobs' jobs, and writes files like @@ -144,7 +144,7 @@ def train_new_models(dir, iter, srand, num_jobs, --backstitch-training-scale={backstitch_training_scale} \ --l2-regularize-factor={l2_regularize_factor} \ --backstitch-training-interval={backstitch_training_interval} \ - --srand={srand} \ + --srand={srand} {train_opts} \ {deriv_time_opts} 
"{raw_model}" "{egs_rspecifier}" \ {dir}/{next_iter}.{job}.raw""".format( command=run_opts.command, @@ -159,6 +159,7 @@ def train_new_models(dir, iter, srand, num_jobs, l2_regularize_factor=1.0/num_jobs, backstitch_training_scale=backstitch_training_scale, backstitch_training_interval=backstitch_training_interval, + train_opts=train_opts, deriv_time_opts=" ".join(deriv_time_opts), raw_model=raw_model_string, egs_rspecifier=egs_rspecifier), @@ -177,9 +178,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, run_opts, image_augmentation_opts=None, frames_per_eg=-1, min_deriv_time=None, max_deriv_time_relative=None, - shrinkage_value=1.0, dropout_edit_string="", - get_raw_nnet_from_am=True, - use_multitask_egs=False, + shrinkage_value=1.0, dropout_edit_string="", train_opts="", + get_raw_nnet_from_am=True, use_multitask_egs=False, backstitch_training_scale=0.0, backstitch_training_interval=1, compute_per_dim_accuracy=False): """ Called from steps/nnet3/train_*.py scripts for one iteration of neural @@ -279,6 +279,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, max_deriv_time_relative=max_deriv_time_relative, image_augmentation_opts=image_augmentation_opts, use_multitask_egs=use_multitask_egs, + train_opts=train_opts, backstitch_training_scale=backstitch_training_scale, backstitch_training_interval=backstitch_training_interval) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 011b6894938..9c90c3d6930 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -503,6 +503,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), shrinkage_value=shrinkage_value, num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, apply_deriv_weights=args.apply_deriv_weights, diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 
073ad3e7d7a..abd803c0e14 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -328,6 +328,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), minibatch_size_str=args.minibatch_size, frames_per_eg=args.frames_per_eg, momentum=args.momentum, @@ -365,16 +366,16 @@ def train(args, run_opts): egs_dir=egs_dir, minibatch_size_str=args.minibatch_size, run_opts=run_opts, max_objective_evaluations=args.max_objective_evaluations) - + if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of " "adjusting the priors.") - + # If args.do_final_combination is true, we will use the combined model. # Otherwise, we will use the last_numbered model. real_iter = 'combined' if args.do_final_combination else num_iters avg_post_vec_file = train_lib.common.compute_average_posterior( - dir=args.dir, iter=real_iter, + dir=args.dir, iter=real_iter, egs_dir=egs_dir, num_archives=num_archives, prior_subset_size=args.prior_subset_size, run_opts=run_opts) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 2d092ceebc7..d5b37871d70 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -356,6 +356,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), minibatch_size_str=args.minibatch_size, frames_per_eg=args.frames_per_eg, momentum=args.momentum, diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index b51632e7d2c..686b76aa7db 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -432,6 +432,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' 
'.join(args.train_opts), shrinkage_value=shrinkage_value, minibatch_size_str=args.num_chunk_per_minibatch, min_deriv_time=min_deriv_time, diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 005e751cae0..2f49c6efff3 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -410,6 +410,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), shrinkage_value=shrinkage_value, minibatch_size_str=args.num_chunk_per_minibatch, min_deriv_time=min_deriv_time, From dbcabb8bcee6ef353141fde9c71cf45eda0b94ce Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 29 Jan 2018 00:22:59 -0500 Subject: [PATCH 102/184] [scripts] Small fix regarding --trainer.add-option option --- egs/wsj/s5/steps/libs/nnet3/train/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 6f3e8877ae8..443834fc161 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -904,7 +904,7 @@ def __init__(self, less general patterns, as they are applied sequentially.""") self.parser.add_argument("--trainer.add-option", type=str, - dest='train_opts', action='append', + dest='train_opts', action='append', default=[], help="""You can use this to add arbitrary options that will be passed through to the core training code (nnet3-train or nnet3-chain-train)""") From cf745735ec88da8fd9af74103685d6560dba9d4a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 29 Jan 2018 20:39:58 -0500 Subject: [PATCH 103/184] [src] Various bug-fixes relating to recent nnet3/chain changes. 
--- src/chain/chain-training.cc | 3 ++- src/nnet3/nnet-analyze.cc | 17 ++++++++++------- src/nnet3/nnet-optimize-utils.cc | 18 ++++++++++++------ src/nnet3/nnet-optimize.h | 7 ++++--- 4 files changed, 28 insertions(+), 17 deletions(-) diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 677e8f8d3dc..bf61bed67f0 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -64,7 +64,8 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, if (xent_output_deriv) { numerator.Backward(xent_output_deriv); - nnet_output_deriv->AddMat(1.0, *xent_output_deriv); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(1.0, *xent_output_deriv); } else if (nnet_output_deriv) { numerator.Backward(nnet_output_deriv); } diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index cf48d3d86c6..ec1d3fa0f2e 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -634,21 +634,24 @@ void ComputationChecker::CheckComputationRewrite() const { Checks for the situation where a variable is read before being written. */ void ComputationChecker::CheckComputationUndefined() const { + // the variable 'min_proportion' needs to be <= the min_proportion_ value in + // class MatrixExtender, otherwise this code could spuriously reject a + // computation. + BaseFloat min_proportion = 0.8; + int32 num_variables = a_.variable_accesses.size(); for (int32 v = 0; v < num_variables; v++) { const std::vector &accesses = a_.variable_accesses[v]; if (accesses.empty()) { if (config_.check_unused_variables) { - // Before we throw an error, we want to check that it isn't - // a case that can be produced by the ExtendMatrices() - // optimization, that is actually allowed. This is a case - // when a variable is the last few rows of a matrix, but - // not all columns of those last rows. 
NnetComputation::SubMatrixInfo info = a_.variables.VariableInfo(v); const NnetComputation::MatrixInfo &matrix_info = computation_.matrices[info.matrix_index]; - if (info.row_offset > 0 && - info.num_rows + info.row_offset == matrix_info.num_rows && + // Before we throw an error, we want to check that it isn't a case that + // can be produced by the ExtendMatrices() optimization, that is + // actually allowed. This is a case when a variable is inside the last + // few rows of a matrix, but not all columns of those last rows. + if (info.row_offset >= min_proportion * matrix_info.num_rows && !(info.col_offset == 0 && info.num_cols == matrix_info.num_cols)) { continue; } diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 19ca31cf955..c53fba815fb 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -1064,6 +1064,9 @@ class MatrixExtender { std::vector is_input_or_output_; }; +// note: the initializer for min_proportion_ below needs to be kept in sync with +// the min_proportion variable in +// ComputationChecker::CheckComputationUndefined() in nnet-analyze.cc. 
MatrixExtender::MatrixExtender(NnetComputation *computation): min_proportion_(0.8), computation_(computation) { @@ -1111,9 +1114,10 @@ bool MatrixExtender::CanBeExtended(int32 dest_submatrix_index, const MatrixInfo &src_matrix = computation_->matrices[src_submatrix.matrix_index]; - int32 dest_matrix_num_rows = orig_num_rows_[dest_submatrix.matrix_index]; + int32 dest_matrix_orig_num_rows = orig_num_rows_[dest_submatrix.matrix_index], + src_matrix_orig_num_rows = orig_num_rows_[src_submatrix.matrix_index]; - if (src_submatrix.num_rows < min_proportion_ * src_matrix.num_rows) + if (src_submatrix.num_rows < min_proportion_ * src_matrix_orig_num_rows) return false; // The following checks that the source submatrix covers be all of the @@ -1124,7 +1128,7 @@ bool MatrixExtender::CanBeExtended(int32 dest_submatrix_index, src_submatrix.row_offset == 0 && src_submatrix.num_rows < src_matrix.num_rows && dest_submatrix.row_offset + dest_submatrix.num_rows == - dest_matrix_num_rows); + dest_matrix_orig_num_rows); } @@ -4614,9 +4618,11 @@ void OptimizeMemoryCompression(const Nnet &nnet, if (GetVerboseLevel() >= 2) { bytes_used_final = GetMaxMemoryUse(*computation); - KALDI_VLOG(2) << "Memory compression reduced memory use from " - << bytes_used_initial << " to " - << bytes_used_final << " bytes."; + if (bytes_used_final != bytes_used_initial) { + KALDI_VLOG(2) << "Memory compression reduced memory use from " + << bytes_used_initial << " to " + << bytes_used_final << " bytes."; + } } } } diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index cf308dd3b00..31872e46b72 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -75,7 +75,7 @@ struct NnetOptimizeOptions { max_deriv_time(std::numeric_limits::max()), max_deriv_time_relative(std::numeric_limits::max()), snip_row_ops(true), - memory_compression_level(0), + memory_compression_level(1), optimize_looped_computation(false) { } void Register(OptionsItf *opts) { @@ -133,10 +133,11 @@ struct 
NnetOptimizeOptions { "per-row operations"); opts->Register("memory-compression-level", &memory_compression_level, "This is only relevant to training, not decoding. Set this " - "to 0,1,2,3; higher levels are more aggressive at reducing " + "to 0,1,2; higher levels are more aggressive at reducing " "memory by compressing quantities needed for backprop, " "potentially at the expense of speed and the accuracy " - "of derivatives. 0 means no compression at all."); + "of derivatives. 0 means no compression at all; 1 means " + "compression that shouldn't affect results at all."); } void Read(std::istream &is, bool binary); From fe18a16e3540e1530900b7d4382bf4423fa949aa Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 29 Jan 2018 23:53:09 -0500 Subject: [PATCH 104/184] [egs] Add new example script --- .../s5c/local/chain/tuning/run_tdnn_7m23t.sh | 542 ++++++++++++++++++ 1 file changed, 542 insertions(+) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23t.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23t.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23t.sh new file mode 100755 index 00000000000..f912b2d1175 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23t.sh @@ -0,0 +1,542 @@ +#!/bin/bash + +# 7m23t is as 7m23r but with 1280 instead of 1536 as the dim. +# Differernce vs. 23r is unclear (maybe slightly worse), but it +# seems slightly better than 23h, and it's nice that it has fewer parameters. 
+ + + +# local/chain/compare_wer_general.sh --rt03 tdnn7m23h_sp tdnn7m23r_sp tdnn7m23t_sp +# System tdnn7m23h_sp tdnn7m23r_sp tdnn7m23t_sp +# WER on train_dev(tg) 12.28 11.95 12.18 +# WER on train_dev(fg) 11.21 10.97 11.12 +# WER on eval2000(tg) 15.0 15.0 14.9 +# WER on eval2000(fg) 13.5 13.6 13.5 +# WER on rt03(tg) 18.5 18.4 18.4 +# WER on rt03(fg) 16.1 15.9 16.2 +# Final train prob -0.083 -0.076 -0.077 +# Final valid prob -0.097 -0.091 -0.093 +# Final train prob (xent) -1.036 -0.978 -0.994 +# Final valid prob (xent) -1.0629 -1.0026 -1.0194 +# Num-parameters 23513380 23513380 20111396 + +# 7m23r is as 7m23h but with 6 epochs instead of 4. See also 7m23p, which +# had 3 epochs. + +# 7m23h is as 7m23b2 but with a small bugfix, removing a stray 'bottleneck-dim=192'. +# Seems slightly better. The comparison below includes our old TDNN+LSTM result +# with dropout, to show that we're doing better than that now. + +# local/chain/compare_wer_general.sh --rt03 tdnn_lstm_1m_ld5_sp tdnn7m23b2_sp tdnn7m23h_sp +# System tdnn_lstm_1m_ld5_sp tdnn7m23b2_sp tdnn7m23h_sp +# WER on train_dev(tg) 12.33 12.38 12.28 +# WER on train_dev(fg) 11.42 11.44 11.21 +# WER on eval2000(tg) 15.2 15.1 15.0 +# WER on eval2000(fg) 13.8 13.6 13.5 +# WER on rt03(tg) 18.6 18.4 18.5 +# WER on rt03(fg) 16.3 16.1 16.1 +# Final train prob -0.082 -0.084 -0.083 +# Final valid prob -0.099 -0.098 -0.097 +# Final train prob (xent) -0.959 -1.049 -1.036 +# Final valid prob (xent) -1.0305 -1.0661 -1.0629 +# Num-parameters 39558436 23120164 23513380 +# +# 7m23b2 is as 7m23b but fixing an issue at the last layers. +# 7m23b is as 7m23 but making the splicing more 'symmetric'... doing the +# splicing in 2 stages. Interestingly, objf is not better than 23, but +# WER is slightly better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19m_sp tdnn7m23_sp tdnn7m23b2_sp +# System tdnn7m19m_sp tdnn7m23_sp tdnn7m23b2_sp +# WER on train_dev(tg) 12.55 12.23 12.38 +# WER on train_dev(fg) 11.52 11.29 11.44 +# WER on eval2000(tg) 15.2 15.2 15.1 +# WER on eval2000(fg) 13.6 13.7 13.6 +# WER on rt03(tg) 18.6 18.7 18.4 +# WER on rt03(fg) 16.2 16.3 16.1 +# Final train prob -0.089 -0.083 -0.084 +# Final valid prob -0.101 -0.097 -0.098 +# Final train prob (xent) -1.080 -1.025 -1.049 +# Final valid prob (xent) -1.0990 -1.0548 -1.0661 +# Num-parameters 21055012 23120164 23120164 + + +# 7m23 is as 7m19m but removing the bottlenecks from the batchnorm components and +# reducing the dim of the linear components... it's basically an attempt to +# reverse the factorization to have the splicing at a different point. +# + +# 7m19m is as 7m19l but with more skip connections +# Hm-- seems better than 19h. +# +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# System tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# WER on train_dev(tg) 12.61 12.72 12.55 +# WER on train_dev(fg) 11.72 11.62 11.52 +# WER on eval2000(tg) 15.4 15.4 15.2 +# WER on eval2000(fg) 13.7 13.8 13.6 +# WER on rt03(tg) 18.9 18.9 18.6 +# WER on rt03(fg) 16.3 16.4 16.2 +# Final train prob -0.091 -0.091 -0.089 +# Final valid prob -0.102 -0.103 -0.101 +# Final train prob (xent) -1.098 -1.095 -1.080 +# Final valid prob (xent) -1.1031 -1.1191 -1.0990 +# Num-parameters 21055012 20268580 21055012 +# +# 7m19l is as 7m19h but projecting down to an intermediate dim (512) before +# doing the Append... doing this by inserting a linear-component between +# pairs of relu-batchnorm-layers. +# A little worse. 
+# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp +# System tdnn7m19h_sp tdnn7m19l_sp +# WER on train_dev(tg) 12.65 12.72 +# WER on train_dev(fg) 11.57 11.62 +# WER on eval2000(tg) 15.3 15.4 +# WER on eval2000(fg) 13.7 13.8 +# WER on rt03(tg) 18.8 18.9 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.091 -0.091 +# Final valid prob -0.102 -0.103 +# Final train prob (xent) -1.091 -1.095 +# Final valid prob (xent) -1.1064 -1.1191 +# Num-parameters 21055012 20268580 + + +# 7m19h is as 7m19e but with an extra bypass connection. A bit better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19e_sp tdnn7m19h_sp +# System tdnn7m19e_sp tdnn7m19h_sp +# WER on train_dev(tg) 12.75 12.65 +# WER on train_dev(fg) 11.77 11.57 +# WER on eval2000(tg) 15.5 15.3 +# WER on eval2000(fg) 14.0 13.7 +# WER on rt03(tg) 18.9 18.8 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.092 -0.091 +# Final valid prob -0.102 -0.102 +# Final train prob (xent) -1.094 -1.091 +# Final valid prob (xent) -1.1095 -1.1064 +# Num-parameters 20760100 21055012 + +# 7m19e is as 7m19c,d but with dims increased to 1536. Better! 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# System tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# WER on train_dev(tg) 13.77 12.86 13.01 12.75 +# WER on train_dev(fg) 12.65 11.82 12.02 11.77 +# WER on eval2000(tg) 16.1 15.4 15.7 15.5 +# WER on eval2000(fg) 14.3 13.8 14.0 14.0 +# WER on rt03(tg) 19.9 19.1 19.2 18.9 +# WER on rt03(fg) 17.4 16.6 16.7 16.4 +# Final train prob -0.111 -0.094 -0.096 -0.092 +# Final valid prob -0.120 -0.103 -0.105 -0.102 +# Final train prob (xent) -1.314 -1.117 -1.144 -1.094 +# Final valid prob (xent) -1.3247 -1.1223 -1.1478 -1.1095 +# Num-parameters 13361700 17824036 14887972 20760100 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# System tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# WER on train_dev(tg) 13.37 13.09 12.93 12.86 13.01 +# WER on train_dev(fg) 12.47 12.12 11.87 11.82 12.02 +# WER on eval2000(tg) 15.8 15.8 15.6 15.4 15.7 +# WER on eval2000(fg) 14.3 14.3 14.0 13.8 14.0 +# WER on rt03(tg) 15.1 14.8 14.9 14.8 14.9 +# WER on rt03(fg) 12.7 12.4 12.5 12.5 12.6 +# Final train prob -0.099 -0.096 -0.096 -0.094 -0.096 +# Final valid prob -0.110 -0.106 -0.106 -0.103 -0.105 +# Final train prob (xent) -1.302 -1.198 -1.188 -1.117 -1.144 +# Final valid prob (xent) -1.3184 -1.2070 -1.1980 -1.1223 -1.1478 +# Num-parameters 14216996 15528996 16512036 17824036 14887972 + +# 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). +# Seems about 0.1% better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# WER on train_dev(tg) 13.09 12.93 12.86 +# WER on train_dev(fg) 12.12 11.87 11.82 +# WER on eval2000(tg) 15.8 15.6 15.4 +# WER on eval2000(fg) 14.3 14.0 13.8 +# WER on rt03(tg) 14.8 14.9 14.8 +# WER on rt03(fg) 12.4 12.5 12.5 +# Final train prob -0.096 -0.096 -0.094 +# Final valid prob -0.106 -0.106 -0.103 +# Final train prob (xent) -1.198 -1.188 -1.117 +# Final valid prob (xent) -1.2070 -1.1980 -1.1223 +# Num-parameters 15528996 16512036 17824036 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 14.8 14.9 +# WER on rt03(fg) 12.4 12.5 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. 
+# Seems a little better but could be due to the increase in parameters. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp +# System tdnn7m8_sp tdnn7m9_sp +# WER on train_dev(tg) 13.60 13.88 +# WER on train_dev(fg) 12.62 12.64 +# WER on eval2000(tg) 16.8 16.1 +# WER on eval2000(fg) 15.4 14.4 +# WER on rt03(tg) 16.2 15.5 +# WER on rt03(fg) 13.7 13.1 +# Final train prob -0.105 -0.111 +# Final valid prob -0.115 -0.119 +# Final train prob (xent) -1.282 -1.309 +# Final valid prob (xent) -1.3194 -1.3246 +# Num-parameters 11580452 13818148 + +# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which +# is the same as 7m2->7m3, which was helpful there. +# Does seem helpful. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# WER on train_dev(tg) 13.70 13.74 13.81 13.60 +# WER on train_dev(fg) 12.67 12.76 12.74 12.62 +# WER on eval2000(tg) 16.6 17.1 17.0 16.8 +# WER on eval2000(fg) 15.1 15.4 15.4 15.4 +# WER on rt03(tg) 16.1 16.2 16.0 16.2 +# WER on rt03(fg) 13.7 13.8 13.6 13.7 +# Final train prob -0.085 -0.106 -0.104 -0.105 +# Final valid prob -0.103 -0.118 -0.116 -0.115 +# Final train prob (xent) -1.230 -1.296 -1.285 -1.282 +# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194 +# Num-parameters 16292693 10924836 11580452 11580452 + + +# 7m5b is as 7m5 but rducing the prefinal layer dims to previous values. +# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average, +# but not clear at all... for consistency with other setups I may retain +# this change. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# WER on train_dev(tg) 13.70 13.74 13.71 13.81 +# WER on train_dev(fg) 12.67 12.76 12.64 12.74 +# WER on eval2000(tg) 16.6 17.1 16.8 17.0 +# WER on eval2000(fg) 15.1 15.4 15.1 15.4 +# WER on rt03(tg) 16.1 16.2 16.2 16.0 +# WER on rt03(fg) 13.7 13.8 13.8 13.6 +# Final train prob -0.085 -0.106 -0.103 -0.104 +# Final valid prob -0.103 -0.118 -0.114 -0.116 +# Final train prob (xent) -1.230 -1.296 -1.274 -1.285 +# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283 +# Num-parameters 16292693 10924836 12170788 11580452 + + +# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer +# and the prefinal layers from 512 to 768. +# 7m2 is as 7m but with a bunch of tuning changes (model is smaller). +# 7m is as 7k but adding two non-splicing layers towards the beginning of the +# network. 
+# The impovement is pretty small but I've seen similar improvements on other +# setups with this architecture so I tend to believe it. + + +# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp +# System tdnn_7k_sp tdnn_7m_sp +# WER on train_dev(tg) 13.83 13.65 +# WER on train_dev(fg) 12.74 12.54 +# WER on eval2000(tg) 16.9 16.8 +# WER on eval2000(fg) 15.2 15.1 +# Final train prob -0.085 -0.084 +# Final valid prob -0.107 -0.103 +# Final train prob (xent) -1.267 -1.215 +# Final valid prob (xent) -1.3107 -1.2735 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp +# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7m23t +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn1l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn4l, tdnn2l) + linear-component name=tdnn5l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn5l,tdnn3l,tdnn1l) dim=1280 + linear-component name=tdnn7l dim=256 $linear_opts 
input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn7l,tdnn5l,tdnn3l) dim=1280 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn9l,tdnn7l,tdnn5l) dim=1280 + linear-component name=tdnn11l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=tdnn11l $opts dim=1280 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=tdnn11l $opts dim=1280 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 6 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; From e14c4b41eb7e40d755e3fd8d4e96aa63183830fb Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Tue, 30 Jan 2018 12:04:44 -0500 Subject: [PATCH 105/184] Add phone-set compatibility checks for nnet3 models --- egs/wsj/s5/steps/nnet3/chain/train.py | 4 ++++ egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh | 3 +++ egs/wsj/s5/steps/nnet3/decode.sh | 7 +++++++ egs/wsj/s5/steps/nnet3/train_dnn.py | 10 +++++++--- egs/wsj/s5/steps/nnet3/train_rnn.py | 4 ++++ egs/wsj/s5/steps/nnet3/train_tdnn.sh | 3 +++ .../s5/utils/lang/check_phones_compatible.sh | 19 ++++++++----------- 7 files changed, 36 insertions(+), 14 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 6896da67f73..6dcd674bac0 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -274,6 +274,10 @@ def train(args, run_opts): chain_lib.check_for_required_files(args.feat_dir, args.tree_dir, args.lat_dir) + # Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will + # need it to check compatibility between training and decoding phone-sets. + shutil.copy('{0}/phones.txt'.format(args.tree_dir), args.dir) + # Set some variables. 
num_jobs = common_lib.get_number_of_jobs(args.tree_dir) feat_dim = common_lib.get_feat_dim(args.feat_dir) diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh index 4ba8cae2d56..8eabe9c33e6 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -158,6 +158,9 @@ for f in $data/feats.scp $treedir/ali.1.gz $treedir/final.mdl $treedir/tree \ [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done +# Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will +# need it to check compatibility between training and decoding phone-sets. +cp $treedir/phones.txt $dir # Set some variables. nj=`cat $treedir/num_jobs` || exit 1; # number of jobs in alignment dir... diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 50e02629db0..27256ca5964 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -70,6 +70,13 @@ if [ ! -z "$online_ivector_dir" ]; then extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" fi +if [ ! -f $srcdir/phones.txt ]; then + echo >&2 "$0: WARNING: The model directory '$srcdir' does not contain phones.txt." + echo >&2 "$0: WARNING: That means it's you who's reponsible for keeping the" + echo >&2 "$0: WARNING: phone-sets compatible between the trained model and the decoding graph." +fi +utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1 + for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; done diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 073ad3e7d7a..ccd9c82b622 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -162,6 +162,10 @@ def train(args, run_opts): arg_string = pprint.pformat(vars(args)) logger.info("Arguments for the experiment\n{0}".format(arg_string)) + # Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will + # need it to check compatibility between training and decoding phone-sets. + shutil.copy('{0}/phones.txt'.format(args.ali_dir), args.dir) + # Set some variables. # num_leaves = common_lib.get_number_of_leaves_from_tree(args.ali_dir) num_jobs = common_lib.get_number_of_jobs(args.ali_dir) @@ -365,16 +369,16 @@ def train(args, run_opts): egs_dir=egs_dir, minibatch_size_str=args.minibatch_size, run_opts=run_opts, max_objective_evaluations=args.max_objective_evaluations) - + if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of " "adjusting the priors.") - + # If args.do_final_combination is true, we will use the combined model. # Otherwise, we will use the last_numbered model. real_iter = 'combined' if args.do_final_combination else num_iters avg_post_vec_file = train_lib.common.compute_average_posterior( - dir=args.dir, iter=real_iter, + dir=args.dir, iter=real_iter, egs_dir=egs_dir, num_archives=num_archives, prior_subset_size=args.prior_subset_size, run_opts=run_opts) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 005e751cae0..3e0f03f7de1 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -218,6 +218,10 @@ def train(args, run_opts): arg_string = pprint.pformat(vars(args)) logger.info("Arguments for the experiment\n{0}".format(arg_string)) + # Copy phones.txt from ali-dir to dir. 
Later, steps/nnet3/decode.sh will + # need it to check compatibility between training and decoding phone-sets. + shutil.copy('{0}/phones.txt'.format(args.ali_dir), args.dir) + # Set some variables. num_jobs = common_lib.get_number_of_jobs(args.ali_dir) feat_dim = common_lib.get_feat_dim(args.feat_dir) diff --git a/egs/wsj/s5/steps/nnet3/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/train_tdnn.sh index fbcf426b205..6537c7c659f 100755 --- a/egs/wsj/s5/steps/nnet3/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/train_tdnn.sh @@ -148,6 +148,9 @@ for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/ [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done +# Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will +# need it to check compatibility between training and decoding phone-sets. +cp $alidir/phones.txt $dir # Set some variables. num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1 diff --git a/egs/wsj/s5/utils/lang/check_phones_compatible.sh b/egs/wsj/s5/utils/lang/check_phones_compatible.sh index 18301a900c5..cfad06d2b8c 100755 --- a/egs/wsj/s5/utils/lang/check_phones_compatible.sh +++ b/egs/wsj/s5/utils/lang/check_phones_compatible.sh @@ -18,11 +18,8 @@ # except for possible differences in disambiguation symbols (meaning that all # symbols except those beginning with a # are mapped to the same values). # Otherwise it prints a warning and exits with status 1. -# For the sake of compatibility with other scripts that did not write the -# phones.txt to model directories, this script exits silently with status 0 -# if one of the phone symbol tables does not exist. 
-# For the sake of compatibility with other scripts that did not write the -# phones.txt to model directories, this script exits silently with status 0 +# For the sake of compatibility with other scripts that did not write the +# phones.txt to model directories, this script exits silently with status 0 # if one of the phone symbol tables does not exist. . utils/parse_options.sh || exit 1; @@ -36,24 +33,24 @@ fi table_first=$1 table_second=$2 -# check the files exist or not +# check if the files exist or not if [ ! -f $table_first ]; then if [ ! -f $table_second ]; then echo "$0: Error! Both of the two phones-symbol tables are absent." echo "Please check your command" exit 1; else - #The phones-symbol-table1 is absent. The model directory maybe created by old script. - #For back compatibility, this script exits silently with status 0. + # The phones-symbol-table1 is absent. The model directory maybe created by old script. + # For back compatibility, this script exits silently with status 0. exit 0; fi elif [ ! -f $table_second ]; then - #The phones-symbol-table2 is absent. The model directory maybe created by old script. - #For back compatibility, this script exits silently with status 0. + # The phones-symbol-table2 is absent. The model directory maybe created by old script. + # For back compatibility, this script exits silently with status 0. exit 0; fi -#Check the two tables are same or not (except for possible difference in disambiguation symbols). +# Check if the two tables are the same (except for possible difference in disambiguation symbols). if ! cmp -s <(grep -v "^#" $table_first) <(grep -v "^#" $table_second); then echo "$0: phone symbol tables $table_first and $table_second are not compatible." 
exit 1; From e4fc87d149e575c1eddc0dcd3412b82c01362d45 Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Tue, 30 Jan 2018 20:32:48 +0100 Subject: [PATCH 106/184] [scripts] bugfix for 'steps/cleanup/clean_and_segment_data.sh', (#2196) - make sure that lattice-generation, and subsequent search of 'oracle-transcript' in them uses the same data-split. --- egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh b/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh index 29d52588807..4a9d43a51b5 100755 --- a/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh +++ b/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh @@ -72,9 +72,9 @@ fi nj=$(cat $latdir/num_jobs) oov=$(cat $lang/oov.int) -utils/split_data.sh --per-utt $data $nj +utils/split_data.sh $data $nj -sdata=$data/split${nj}utt +sdata=$data/split$nj; if [ $stage -le 1 ]; then $cmd JOB=1:$nj $dir/log/get_oracle.JOB.log \ From 7bfafa46f2d52dec0d12fdad649c69e0a725c4b2 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Tue, 30 Jan 2018 18:38:57 -0500 Subject: [PATCH 107/184] Some changes in the previous commit --- egs/wsj/s5/steps/nnet3/chain/train.py | 2 +- egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh | 2 +- egs/wsj/s5/steps/nnet3/decode.sh | 5 ----- egs/wsj/s5/steps/nnet3/train_dnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_rnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_tdnn.sh | 2 +- 6 files changed, 5 insertions(+), 10 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 6dcd674bac0..82ea2771048 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -275,7 +275,7 @@ def train(args, run_opts): args.lat_dir) # Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will - # need it to check compatibility between training and decoding phone-sets. 
+ # use it to check compatibility between training and decoding phone-sets. shutil.copy('{0}/phones.txt'.format(args.tree_dir), args.dir) # Set some variables. diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh index 8eabe9c33e6..f5340fb4611 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -159,7 +159,7 @@ for f in $data/feats.scp $treedir/ali.1.gz $treedir/final.mdl $treedir/tree \ done # Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will -# need it to check compatibility between training and decoding phone-sets. +# use it to check compatibility between training and decoding phone-sets. cp $treedir/phones.txt $dir # Set some variables. diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 27256ca5964..8c520e0b5e1 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -70,11 +70,6 @@ if [ ! -z "$online_ivector_dir" ]; then extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" fi -if [ ! -f $srcdir/phones.txt ]; then - echo >&2 "$0: WARNING: The model directory '$srcdir' does not contain phones.txt." - echo >&2 "$0: WARNING: That means it's you who's reponsible for keeping the" - echo >&2 "$0: WARNING: phone-sets compatible between the trained model and the decoding graph." -fi utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1 for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index ccd9c82b622..0fe0e4ef445 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -163,7 +163,7 @@ def train(args, run_opts): logger.info("Arguments for the experiment\n{0}".format(arg_string)) # Copy phones.txt from ali-dir to dir. 
Later, steps/nnet3/decode.sh will - # need it to check compatibility between training and decoding phone-sets. + # use it to check compatibility between training and decoding phone-sets. shutil.copy('{0}/phones.txt'.format(args.ali_dir), args.dir) # Set some variables. diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 3e0f03f7de1..78d4eb98d16 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -219,7 +219,7 @@ def train(args, run_opts): logger.info("Arguments for the experiment\n{0}".format(arg_string)) # Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will - # need it to check compatibility between training and decoding phone-sets. + # use it to check compatibility between training and decoding phone-sets. shutil.copy('{0}/phones.txt'.format(args.ali_dir), args.dir) # Set some variables. diff --git a/egs/wsj/s5/steps/nnet3/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/train_tdnn.sh index 6537c7c659f..f023d38b26c 100755 --- a/egs/wsj/s5/steps/nnet3/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/train_tdnn.sh @@ -149,7 +149,7 @@ for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/ done # Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will -# need it to check compatibility between training and decoding phone-sets. +# use it to check compatibility between training and decoding phone-sets. cp $alidir/phones.txt $dir # Set some variables. 
From 1647856ee45282bf0e20dbd49c7505a9a8c36d4b Mon Sep 17 00:00:00 2001 From: Pavel Denisov Date: Wed, 31 Jan 2018 23:25:12 +0100 Subject: [PATCH 108/184] [egs] Small bug-fix in Librispeech recipe (#2190) --- egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh index 8546955c93c..cd26773f50f 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh @@ -173,8 +173,6 @@ if [ $stage -le 15 ]; then /export/b0{5,6,7,8}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi - touch $dir/egs/.nodelete # keep egs around when that run dies. - steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ --feat.online-ivector-dir $train_ivector_dir \ From 2de3b38666a69774f4896bc96a2b62e3f62c63fe Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Wed, 31 Jan 2018 20:22:24 -0800 Subject: [PATCH 109/184] [src] Make arpa2fst robust against ARPA files without . (#2167) --- .gitignore | 1 + src/lm/arpa-lm-compiler-test.cc | 14 ++++++++++++++ src/lm/arpa-lm-compiler.cc | 8 ++++++++ src/lm/arpa-lm-compiler.h | 1 + src/lm/test_data/missing_bos.arpa | 18 ++++++++++++++++++ 5 files changed, 42 insertions(+) create mode 100644 src/lm/test_data/missing_bos.arpa diff --git a/.gitignore b/.gitignore index 940a571d2ca..bd6410c4aab 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Compiled extensionless executable files in /src/*/ # This stanza must precede wildcard patterns below! 
/src/*/* +!/src/lm/test_data/ !/src/*/?*.* !/src/doc/* !/src/*/Makefile diff --git a/src/lm/arpa-lm-compiler-test.cc b/src/lm/arpa-lm-compiler-test.cc index 21239afb475..697d70c416a 100644 --- a/src/lm/arpa-lm-compiler-test.cc +++ b/src/lm/arpa-lm-compiler-test.cc @@ -204,6 +204,17 @@ bool ScoringTest(bool seps, const string &infile, const string& sentence, return ok; } +bool ThrowsExceptionTest(bool seps, const string &infile) { + try { + // Make memory cleanup easy in both cases of try-catch block. + std::unique_ptr compiler(Compile(seps, infile)); + return false; + } catch (const std::runtime_error&) { + // Kaldi throws only std::runtime_error in kaldi-error.cc + return true; + } +} + } // namespace kaldi bool RunAllTests(bool seps) { @@ -214,6 +225,9 @@ bool RunAllTests(bool seps) { ok &= kaldi::ScoringTest(seps, "test_data/input.arpa", "b b b a", 59.2649); ok &= kaldi::ScoringTest(seps, "test_data/input.arpa", "a b", 4.36082); + + ok &= kaldi::ThrowsExceptionTest(seps, "test_data/missing_bos.arpa"); + if (!ok) { KALDI_WARN << "Tests " << (seps ? 
"with" : "without") << " epsilon substitution FAILED"; diff --git a/src/lm/arpa-lm-compiler.cc b/src/lm/arpa-lm-compiler.cc index c854b077d00..d774deeb783 100644 --- a/src/lm/arpa-lm-compiler.cc +++ b/src/lm/arpa-lm-compiler.cc @@ -360,10 +360,18 @@ void ArpaLmCompiler::RemoveRedundantStates() { << fst_.NumStates(); } +void ArpaLmCompiler::Check() const { + if (fst_.Start() == fst::kNoStateId) { + KALDI_ERR << "Arpa file did not contain the beginning-of-sentence symbol " + << Symbols()->Find(Options().bos_symbol) << "."; + } +} + void ArpaLmCompiler::ReadComplete() { fst_.SetInputSymbols(Symbols()); fst_.SetOutputSymbols(Symbols()); RemoveRedundantStates(); + Check(); } } // namespace kaldi diff --git a/src/lm/arpa-lm-compiler.h b/src/lm/arpa-lm-compiler.h index 35fd52d6cf3..3e3baeb6ee1 100644 --- a/src/lm/arpa-lm-compiler.h +++ b/src/lm/arpa-lm-compiler.h @@ -52,6 +52,7 @@ class ArpaLmCompiler : public ArpaFileParser { // this function removes states that only have a backoff arc coming // out of them. void RemoveRedundantStates(); + void Check() const; int sub_eps_; ArpaLmCompilerImplInterface* impl_; // Owned. 
diff --git a/src/lm/test_data/missing_bos.arpa b/src/lm/test_data/missing_bos.arpa new file mode 100644 index 00000000000..487061a49a4 --- /dev/null +++ b/src/lm/test_data/missing_bos.arpa @@ -0,0 +1,18 @@ + +\data\ +ngram 1=3 +ngram 2=1 +ngram 3=1 + +\1-grams: +-5.234679 a -3.3 +-3.456783 b -3.0 +-4.333333 + +\2-grams: +-1.45678 a b -3.23 + +\3-grams: +-0.23940 a b + +\end\ From c82560ddb2add09878c20df6d4b04c7f23f8c010 Mon Sep 17 00:00:00 2001 From: Matthew Maciejewski Date: Thu, 1 Feb 2018 18:11:44 -0500 Subject: [PATCH 110/184] [scripts] Fixed small issue get_uniform_subsegments.py (RE rounding) (#2200) --- egs/wsj/s5/utils/data/get_uniform_subsegments.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/wsj/s5/utils/data/get_uniform_subsegments.py b/egs/wsj/s5/utils/data/get_uniform_subsegments.py index adf042f3d4d..c61b96e0dbb 100755 --- a/egs/wsj/s5/utils/data/get_uniform_subsegments.py +++ b/egs/wsj/s5/utils/data/get_uniform_subsegments.py @@ -87,8 +87,8 @@ def run(args): else: end = end_time new_utt = "{utt_id}-{s:08d}-{e:08d}".format( - utt_id=utt_id, s=int(100 * (start - start_time)), - e=int(100 * (end - start_time))) + utt_id=utt_id, s=int(round(100 * (start - start_time))), + e=int(round(100 * (end - start_time)))) print ("{new_utt} {utt_id} {s} {e}".format( new_utt=new_utt, utt_id=utt_id, s=start - start_time, e=end - start_time)) From b4fbe00b0b827d7b9f914ab6b82b29a903a16144 Mon Sep 17 00:00:00 2001 From: Yiming Wang Date: Fri, 2 Feb 2018 22:48:53 -0500 Subject: [PATCH 111/184] [egs] Add assert to check --backstitch-training-interval option (#2203) (#2204) --- src/nnet3/nnet-chain-training.cc | 3 ++- src/nnet3/nnet-training.cc | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 780a7115a8a..3e6d8599382 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -37,7 +37,8 @@ 
NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, if (opts.nnet_config.zero_component_stats) ZeroComponentStats(nnet); KALDI_ASSERT(opts.nnet_config.momentum >= 0.0 && - opts.nnet_config.max_param_change >= 0.0); + opts.nnet_config.max_param_change >= 0.0 && + opts.nnet_config.backstitch_training_interval > 0); delta_nnet_ = nnet_->Copy(); ScaleNnet(0.0, delta_nnet_); const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 30cd47b3eb2..a9093523222 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -34,7 +34,8 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config, if (config.zero_component_stats) ZeroComponentStats(nnet); KALDI_ASSERT(config.momentum >= 0.0 && - config.max_param_change >= 0.0); + config.max_param_change >= 0.0 && + config.backstitch_training_interval > 0); delta_nnet_ = nnet_->Copy(); ScaleNnet(0.0, delta_nnet_); const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); From 9e2d8442bfee9fca88c1fad4f138fcaef0ac1e3f Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sun, 4 Feb 2018 05:22:04 +0330 Subject: [PATCH 112/184] [egs,scripts] Fix and simplify speed-perturbation scripts; fix permissions. 
(#2205) --- .../s5/local/chain/compare_wer_general.sh | 0 .../s5/local/chain/run_blstm_6h.sh | 0 .../s5/local/chain/run_blstm_6j.sh | 0 egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh | 0 .../s5/local/chain/run_tdnn_lstm_1a.sh | 0 .../s5/local/chain/run_tdnn_lstm_1b.sh | 0 .../s5/local/chain/run_tdnn_opgru_1a.sh | 0 .../s5/local/chain/run_tdnn_opgru_1b.sh | 0 .../s5c/local/nnet3/run_ivector_common.sh | 45 +++++++------------ .../utils/data/perturb_data_dir_speed_3way.sh | 12 ++++- 10 files changed, 27 insertions(+), 30 deletions(-) mode change 100644 => 100755 egs/fisher_swbd/s5/local/chain/compare_wer_general.sh mode change 100644 => 100755 egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh mode change 100644 => 100755 egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh mode change 100644 => 100755 egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh mode change 100644 => 100755 egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh mode change 100644 => 100755 egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh mode change 100644 => 100755 egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh mode change 100644 => 100755 egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh diff --git a/egs/fisher_swbd/s5/local/chain/compare_wer_general.sh b/egs/fisher_swbd/s5/local/chain/compare_wer_general.sh old mode 100644 new mode 100755 diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh old mode 100644 new mode 100755 diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh old mode 100644 new mode 100755 diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh old mode 100644 new mode 100755 diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh old mode 100644 new mode 100755 diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh 
b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh old mode 100644 new mode 100755 diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh old mode 100644 new mode 100755 diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh old mode 100644 new mode 100755 diff --git a/egs/swbd/s5c/local/nnet3/run_ivector_common.sh b/egs/swbd/s5c/local/nnet3/run_ivector_common.sh index 5e132830cfb..d45095ec85b 100755 --- a/egs/swbd/s5c/local/nnet3/run_ivector_common.sh +++ b/egs/swbd/s5c/local/nnet3/run_ivector_common.sh @@ -4,49 +4,38 @@ set -e stage=1 train_stage=-10 -generate_alignments=true # false if doing ctc training +generate_alignments=true speed_perturb=true . ./path.sh . ./utils/parse_options.sh -mkdir -p nnet3 -# perturbed data preparation +mkdir -p exp/nnet3 train_set=train_nodup if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi -if [ "$speed_perturb" == "true" ]; then +if $speed_perturb; then if [ $stage -le 1 ]; then - #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment + # Although the nnet will be trained by high resolution data, we still have to perturb the normal data to get the alignments # _sp stands for speed-perturbed - - for datadir in train_nodup; do - utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1 - utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2 - utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2 - utils/validate_data_dir.sh --no-feats data/${datadir}_tmp - rm -r data/temp1 data/temp2 - - mfccdir=mfcc_perturbed - steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ - data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; - utils/fix_data_dir.sh data/${datadir}_tmp - - utils/copy_data_dir.sh 
--spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0 - utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0 - utils/fix_data_dir.sh data/${datadir}_sp - rm -r data/temp0 data/${datadir}_tmp - done + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + + echo "$0: creating MFCC features for low-resolution speed-perturbed data" + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${train_set}_sp exp/make_mfcc/${train_set}_sp $mfccdir + steps/compute_cmvn_stats.sh data/${train_set}_sp exp/make_mfcc/${train_set}_sp $mfccdir + utils/fix_data_dir.sh data/${train_set}_sp fi - if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then - #obtain the alignment of the perturbed data + if [ $stage -le 2 ] && $generate_alignments; then + # obtain the alignment of the perturbed data steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ - data/train_nodup_sp data/lang exp/tri4 exp/tri4_ali_nodup_sp || exit 1 + data/${train_set}_sp data/lang exp/tri4 exp/tri4_ali_nodup_sp fi - train_set=train_nodup_sp + train_set=${train_set}_sp fi if [ $stage -le 3 ]; then diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh b/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh index 5b007cadb3f..048220d62fd 100755 --- a/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh +++ b/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh @@ -38,9 +38,17 @@ utils/data/get_utt2dur.sh ${srcdir} utils/data/perturb_data_dir_speed.sh 0.9 ${srcdir} ${destdir}_speed0.9 || exit 1 utils/data/perturb_data_dir_speed.sh 1.1 ${srcdir} ${destdir}_speed1.1 || exit 1 -utils/data/combine_data.sh $destdir ${srcdir} ${destdir}_speed0.9 ${destdir}_speed1.1 || exit 1 -rm -r ${destdir}_speed0.9 ${destdir}_speed1.1 +utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- ${srcdir} ${destdir}_speed1.0 +if [ ! 
-f $srcdir/utt2uniq ]; then + cat $srcdir/utt2spk | awk '{printf("sp1.0-%s %s\n", $1, $1);}' > ${destdir}_speed1.0/utt2uniq +else + cat $srcdir/utt2uniq | awk '{printf("sp1.0-%s %s\n", $1, $2);}' > ${destdir}_speed1.0/utt2uniq +fi + +utils/data/combine_data.sh $destdir ${destdir}_speed1.0 ${destdir}_speed0.9 ${destdir}_speed1.1 || exit 1 + +rm -r ${destdir}_speed0.9 ${destdir}_speed1.1 ${destdir}_speed1.0 echo "$0: generated 3-way speed-perturbed version of data in $srcdir, in $destdir" utils/validate_data_dir.sh --no-feats --no-text $destdir From 79065901b7d4a58d757dcba4fcdac89a374caff2 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 6 Feb 2018 16:29:56 -0500 Subject: [PATCH 113/184] [doc] remove outdated TODOs. Thx: David Van Leeuwen. --- src/doc/chain.dox | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/src/doc/chain.dox b/src/doc/chain.dox index 687d9a2d7a7..ca2efce3627 100644 --- a/src/doc/chain.dox +++ b/src/doc/chain.dox @@ -389,34 +389,7 @@ on the paths. You might notice in the current example scripts that we use iVectors. We do so just because they generally help a bit, and because the baseline setup we were comparing with, uses them. There is no inherent connection with 'chain' - models, and no fundamental requirement to use them. Actually we want to get rid - of them (see below). - - - \section chain_next_steps Next steps (TODOs) with 'chain' models - - (Note: this list is valid as of Dec 13 2015, but may become out of date). - Things we need to do (and that we'd like help with) are: - - Supply example scripts (and tune them) on a wide range of corpora - (It will be interesting to see whether there are scale-dependent effects - affecting how well this model works). - - Create and tune LSTM and BLSTM versions of the training script. (This - may involve some playing around with learning rate schedules and - configurations). - - Figure out how to speed up the forward-backward part of the computation. 
- (E.g. using state-level pruning, or just by optimizing the current kernels or - data structures). - - A longer-term TODO, which Dan should do, is to create an online decoding setup - for these models. Actually this isn't really distinct from nnet3 online - decoding in general, since the models are no different from regular nnet3 - acoustic models. But we do have to decide whether to continue to support - iVectors-- getting rid of them would simplify the setup considerably, and - would hopefully make it more robust. We are hoping that with LSTMs, since it - already sees quite a wide acoustic context, iVector adaptation will no longer - be as helpful and could be dropped. We also have other ideas how to - incorporate adaptation as part of the neural network, without the use of - iVectors. This will require some experimentation. + models, and no fundamental requirement to use them. */ From 027f1d79611395183af184d50e1dc82552480203 Mon Sep 17 00:00:00 2001 From: Kuang R Date: Wed, 7 Feb 2018 14:49:55 +0800 Subject: [PATCH 114/184] [scripts] Add missing import statement in script (#2207) --- egs/wsj/s5/steps/libs/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index b84fa46f622..1e8e2ced6ce 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -14,7 +14,6 @@ import logging import math import os -import re import subprocess import sys import threading From 8e170e039c102a7b52cc4047296c21b6f3055227 Mon Sep 17 00:00:00 2001 From: Xiaohui Zhang Date: Wed, 7 Feb 2018 16:31:17 -0500 Subject: [PATCH 115/184] =?UTF-8?q?[egs]=20multi=5Fen:=20Fixed=20acronym?= =?UTF-8?q?=20normalization,=20swbd=20lexicon=20preparation,=20OOV=20?= =?UTF-8?q?=E2=80=A6=20(#2137)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * multi_en: Fixed acronym normalization, swbd lexicon preparation, OOV pronunciation generation, acoustic data 
sub-sampling,.etc; Added hub4_97 data --- egs/multi_en/s5/README.md | 2 +- egs/multi_en/s5/RESULTS | 39 +-- egs/multi_en/s5/conf/mfcc.conf | 1 + egs/multi_en/s5/local/g2p/apply_g2p.sh | 2 +- egs/multi_en/s5/local/hub4_96_data_prep.sh | 52 ++++ egs/multi_en/s5/local/hub4_96_parse_sgm.pl | 235 +++++++++++++++++ egs/multi_en/s5/local/hub4_97_data_prep.sh | 50 ++++ egs/multi_en/s5/local/hub4_97_parse_sgm.pl | 235 +++++++++++++++++ egs/multi_en/s5/local/hub4_data_prep.py | 242 ------------------ egs/multi_en/s5/local/hub4_en_data_prep.sh | 62 +++++ egs/multi_en/s5/local/hub4_format_data.pl | 138 ++++++++++ .../local/hub4_normalize_bn96_transcripts.pl | 33 +++ .../local/hub4_normalize_bn97_transcripts.pl | 42 +++ egs/multi_en/s5/local/hub4_utils.py | 174 ------------- .../s5/local/librispeech_data_prep.sh | 13 +- egs/multi_en/s5/local/make_partitions.sh | 9 +- egs/multi_en/s5/local/swbd1_data_prep.sh | 10 +- egs/multi_en/s5/local/tedlium_prepare_data.sh | 3 +- egs/multi_en/s5/local/wsj_data_prep.sh | 3 +- egs/multi_en/s5/run.sh | 117 ++++----- egs/swbd/s5c/local/score_basic.sh | 3 +- egs/wsj/s5/utils/data/resample_data_dir.sh | 7 + 22 files changed, 943 insertions(+), 529 deletions(-) mode change 100644 => 100755 egs/multi_en/s5/RESULTS create mode 100755 egs/multi_en/s5/local/hub4_96_data_prep.sh create mode 100755 egs/multi_en/s5/local/hub4_96_parse_sgm.pl create mode 100755 egs/multi_en/s5/local/hub4_97_data_prep.sh create mode 100755 egs/multi_en/s5/local/hub4_97_parse_sgm.pl delete mode 100755 egs/multi_en/s5/local/hub4_data_prep.py create mode 100755 egs/multi_en/s5/local/hub4_en_data_prep.sh create mode 100755 egs/multi_en/s5/local/hub4_format_data.pl create mode 100755 egs/multi_en/s5/local/hub4_normalize_bn96_transcripts.pl create mode 100755 egs/multi_en/s5/local/hub4_normalize_bn97_transcripts.pl delete mode 100644 egs/multi_en/s5/local/hub4_utils.py diff --git a/egs/multi_en/s5/README.md b/egs/multi_en/s5/README.md index 0affcb9cf08..20505c5af6f 100755 --- 
a/egs/multi_en/s5/README.md +++ b/egs/multi_en/s5/README.md @@ -2,7 +2,7 @@ This is a WIP **English LVCSR recipe** that trains on data from multiple corpora * Fisher (1761 hours) * Switchboard (317 hours) * WSJ (81 hours) -* HUB4 English Broadcast News (76 hours) +* HUB4 (1996 & 1997) English Broadcast News (75 + 72 hours) * TED-LIUM (118 hours) * Librispeech (960 hours) diff --git a/egs/multi_en/s5/RESULTS b/egs/multi_en/s5/RESULTS old mode 100644 new mode 100755 index 24b82755b94..17eb49c3740 --- a/egs/multi_en/s5/RESULTS +++ b/egs/multi_en/s5/RESULTS @@ -36,21 +36,24 @@ exit 0 # multi_a tri5 tedlium_tg_tedlium.si || %WER 29.0 | 1155 27512 | 75.8 20.3 3.9 4.8 29.0 93.3 | exp/multi_a/tri5/decode_tedlium_tg_tedlium.si/score_11_0.5/test.ctm.filt.sys # Results with the current data combination, lexicon preparation, and acoustic model training procedures. -# On eval2000 the final GMM results is 24.3, which is better than the above result (24.9). - -multi_a tri1b tg_eval2000 || %WER 40.3 | 4459 42989 | 63.7 26.1 10.2 4.0 40.3 72.9 | exp/multi_a/tri1b/decode_tg_eval2000/score_10_0.5/eval2000.ctm.filt.sys -multi_a tri1b tg_eval2000.si || %WER 45.3 | 4459 42989 | 59.2 29.3 11.4 4.6 45.3 75.4 | exp/multi_a/tri1b/decode_tg_eval2000.si/score_11_0.0/eval2000.ctm.filt.sys -multi_a tri3a tg_eval2000 || %WER 33.3 | 4459 42989 | 70.4 21.0 8.6 3.7 33.3 69.6 | exp/multi_a/tri3a/decode_tg_eval2000/score_11_1.0/eval2000.ctm.filt.sys -multi_a tri3a tg_eval2000.si || %WER 38.5 | 4459 42989 | 65.9 24.7 9.5 4.4 38.5 72.5 | exp/multi_a/tri3a/decode_tg_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys -multi_a tri3b tg_eval2000 || %WER 27.9 | 4459 42989 | 75.8 17.9 6.3 3.7 27.9 67.1 | exp/multi_a/tri3b/decode_tg_eval2000/score_11_0.0/eval2000.ctm.filt.sys -multi_a tri3b tg_eval2000.si || %WER 31.6 | 4459 42989 | 71.9 20.3 7.8 3.5 31.6 68.8 | exp/multi_a/tri3b/decode_tg_eval2000.si/score_12_0.0/eval2000.ctm.filt.sys -multi_a tri3b tg_sp_eval2000 || %WER 26.7 | 4459 42989 | 77.2 17.1 5.7 3.9 
26.7 65.6 | exp/multi_a/tri3b/decode_tg_sp_eval2000/score_11_0.0/eval2000.ctm.filt.sys -multi_a tri3b tg_sp_eval2000.si || %WER 30.6 | 4459 42989 | 73.1 19.6 7.3 3.8 30.6 68.2 | exp/multi_a/tri3b/decode_tg_sp_eval2000.si/score_12_1.0/eval2000.ctm.filt.sys -multi_a tri4 tg_eval2000 || %WER 24.8 | 4459 42989 | 78.5 16.0 5.5 3.4 24.8 63.8 | exp/multi_a/tri4/decode_tg_eval2000/score_12_1.0/eval2000.ctm.filt.sys -multi_a tri4 tg_eval2000.si || %WER 31.2 | 4459 42989 | 72.6 20.6 6.8 3.9 31.2 67.6 | exp/multi_a/tri4/decode_tg_eval2000.si/score_12_0.0/eval2000.ctm.filt.sys -multi_a tri5a tg_eval2000 || %WER 24.3 | 4459 42989 | 78.8 15.6 5.6 3.2 24.3 63.3 | exp/multi_a/tri5a/decode_tg_eval2000/score_13_0.0/eval2000.ctm.filt.sys -multi_a tri5a tg_eval2000.si || %WER 30.6 | 4459 42989 | 73.7 20.2 6.1 4.3 30.6 67.9 | exp/multi_a/tri5a/decode_tg_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys -multi_a tri5a tg_sp_eval2000 || %WER 24.2 | 4459 42989 | 79.1 15.6 5.3 3.3 24.2 63.2 | exp/multi_a/tri5a/decode_tg_sp_eval2000/score_12_0.0/eval2000.ctm.filt.sys -multi_a tri5a tg_sp_eval2000.si || %WER 30.5 | 4459 42989 | 73.7 20.3 6.0 4.2 30.5 67.8 | exp/multi_a/tri5a/decode_tg_sp_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys -multi_a tri5b tg_eval2000 || %WER 24.3 | 4459 42989 | 79.3 15.7 5.0 3.6 24.3 63.5 | exp/multi_a/tri5b/decode_tg_eval2000/score_11_0.0/eval2000.ctm.filt.sys -multi_a tri5b tg_eval2000.si || %WER 30.7 | 4459 42989 | 73.6 20.4 6.0 4.3 30.7 68.1 | exp/multi_a/tri5b/decode_tg_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys +# On eval2000 the final GMM results is 24.5, which is better than the above result (24.9). 
+multi_a tri1b tg_eval2000 || %WER 40.4 | 4459 42989 | 63.8 25.9 10.3 4.2 40.4 72.7 | exp/multi_a/tri1b/decode_tg_eval2000/score_10_0.5/eval2000.ctm.filt.sys +multi_a tri1b tg_eval2000.si || %WER 45.0 | 4459 42989 | 59.3 28.8 11.9 4.3 45.0 75.0 | exp/multi_a/tri1b/decode_tg_eval2000.si/score_12_0.0/eval2000.ctm.filt.sys +multi_a tri3a tg_eval2000 || %WER 33.4 | 4459 42989 | 70.5 21.3 8.3 3.9 33.4 69.7 | exp/multi_a/tri3a/decode_tg_eval2000/score_11_0.0/eval2000.ctm.filt.sys +multi_a tri3a tg_eval2000.si || %WER 38.4 | 4459 42989 | 66.2 24.2 9.6 4.6 38.4 72.3 | exp/multi_a/tri3a/decode_tg_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys +multi_a tri3b tg_eval2000 || %WER 27.8 | 4459 42989 | 75.7 17.8 6.6 3.5 27.8 66.6 | exp/multi_a/tri3b/decode_tg_eval2000/score_12_0.0/eval2000.ctm.filt.sys +multi_a tri3b tg_eval2000.si || %WER 31.7 | 4459 42989 | 71.8 20.3 7.8 3.6 31.7 69.0 | exp/multi_a/tri3b/decode_tg_eval2000.si/score_12_0.5/eval2000.ctm.filt.sys +multi_a tri3b tg_sp_eval2000 || %WER 26.8 | 4459 42989 | 77.0 17.3 5.7 3.8 26.8 65.2 | exp/multi_a/tri3b/decode_tg_sp_eval2000/score_11_1.0/eval2000.ctm.filt.sys +multi_a tri3b tg_sp_eval2000.si || %WER 30.5 | 4459 42989 | 73.7 19.7 6.7 4.2 30.5 68.0 | exp/multi_a/tri3b/decode_tg_sp_eval2000.si/score_11_0.0/eval2000.ctm.filt.sys +multi_a tri4 tg_eval2000 || %WER 24.8 | 4459 42989 | 78.6 15.8 5.5 3.5 24.8 64.1 | exp/multi_a/tri4/decode_tg_eval2000/score_12_1.0/eval2000.ctm.filt.sys +multi_a tri4 tg_eval2000.si || %WER 31.3 | 4459 42989 | 73.1 20.8 6.2 4.4 31.3 68.7 | exp/multi_a/tri4/decode_tg_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys +multi_a tri5a tg_eval2000 || %WER 24.5 | 4459 42989 | 79.0 15.7 5.3 3.5 24.5 63.4 | exp/multi_a/tri5a/decode_tg_eval2000/score_12_0.0/eval2000.ctm.filt.sys +multi_a tri5a tg_eval2000.si || %WER 30.4 | 4459 42989 | 73.3 20.0 6.6 3.8 30.4 67.5 | exp/multi_a/tri5a/decode_tg_eval2000.si/score_12_0.5/eval2000.ctm.filt.sys +multi_a tri5a tg_sp_eval2000 || %WER 24.5 | 4459 42989 | 78.9 15.7 
5.4 3.4 24.5 63.4 | exp/multi_a/tri5a/decode_tg_sp_eval2000/score_12_0.5/eval2000.ctm.filt.sys +multi_a tri5a tg_sp_eval2000.si || %WER 30.5 | 4459 42989 | 73.5 20.1 6.5 4.0 30.5 67.8 | exp/multi_a/tri5a/decode_tg_sp_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys +multi_a tri5b tg_eval2000 || %WER 24.4 | 4459 42989 | 79.1 15.6 5.3 3.5 24.4 63.4 | exp/multi_a/tri5b/decode_tg_eval2000/score_12_0.0/eval2000.ctm.filt.sys +multi_a tri5b tg_eval2000.si || %WER 30.5 | 4459 42989 | 73.5 20.2 6.3 4.0 30.5 67.3 | exp/multi_a/tri5b/decode_tg_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys +multi_a tri6a tg_eval2000 || %WER 24.5 | 4459 42989 | 78.8 15.7 5.5 3.4 24.5 63.0 | exp/multi_a/tri6a/decode_tg_eval2000/score_13_0.5/eval2000.ctm.filt.sys +multi_a tri6a tg_eval2000.si || %WER 31.5 | 4459 42989 | 73.1 21.0 5.9 4.6 31.5 68.1 | exp/multi_a/tri6a/decode_tg_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys +multi_a tri6a tg_sp_eval2000 || %WER 24.6 | 4459 42989 | 78.9 15.8 5.3 3.5 24.6 63.3 | exp/multi_a/tri6a/decode_tg_sp_eval2000/score_12_1.0/eval2000.ctm.filt.sys +multi_a tri6a tg_sp_eval2000.si || %WER 31.5 | 4459 42989 | 72.6 21.0 6.4 4.2 31.5 67.9 | exp/multi_a/tri6a/decode_tg_sp_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys diff --git a/egs/multi_en/s5/conf/mfcc.conf b/egs/multi_en/s5/conf/mfcc.conf index 4f780bf520c..9a17e801b3f 100644 --- a/egs/multi_en/s5/conf/mfcc.conf +++ b/egs/multi_en/s5/conf/mfcc.conf @@ -2,3 +2,4 @@ --sample-frequency=8000 --low-freq=20 --high-freq=3700 +--allow-downsample=true diff --git a/egs/multi_en/s5/local/g2p/apply_g2p.sh b/egs/multi_en/s5/local/g2p/apply_g2p.sh index 88b37f21ad8..f8e50302c29 100755 --- a/egs/multi_en/s5/local/g2p/apply_g2p.sh +++ b/egs/multi_en/s5/local/g2p/apply_g2p.sh @@ -33,7 +33,7 @@ cat data/*/train/text | \ perl -ape 's/\s/\n/g;' | \ sort | uniq > $workdir/missing.txt cat $workdir/missing.txt | \ - grep "^[a-z0-9.'_-]*$" > $workdir/missing_onlywords.txt + grep "^[a-z]*$" > $workdir/missing_onlywords.txt echo 
'Synthesizing pronunciations for missing words...' phonetisaurus-apply --nbest $var_counts --model $model --thresh 5 --accumulate --word_list $workdir/missing_onlywords.txt > $workdir/missing_g2p_${var_counts}.txt diff --git a/egs/multi_en/s5/local/hub4_96_data_prep.sh b/egs/multi_en/s5/local/hub4_96_data_prep.sh new file mode 100755 index 00000000000..f258ea7b7f5 --- /dev/null +++ b/egs/multi_en/s5/local/hub4_96_data_prep.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +########################################################################################### +# This script was copied from egs/hub4_english/s5/local/data_prep/prepare_1996_bn_data.sh +# The source commit was 191ae0a6e5db19d316c82a78c746bcd56cc2d7da +# Changes in lower level script/dir names were made +########################################################################################### + +#!/bin/bash +# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal) +# 2017 Vimal Manohar +# License: Apache 2.0 + +# This script prepares the 1996 English Broadcast News (HUB4) corpus. +# /export/corpora/LDC/LDC97S44 +# /export/corpora/LDC/LDC97T22 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora/LDC/LDC97T22/hub4_eng_train_trans /export/corpora/LDC/LDC97S44/data data/local/data/train_bn96" + exit 1 +fi + +text_source_dir=$1 # /export/corpora/LDC/LDC97T22/hub4_eng_train_trans +speech_source_dir=$2 # /export/corpora/LDC/LDC97S44/data +out=$3 + +mkdir -p $out; + +ls $text_source_dir/*/*.txt > $out/text.list +ls $speech_source_dir/*.sph > $out/audio.list + +if [ ! -s $out/text.list ] || [ ! -s $out/audio.list ]; then + echo "$0: Could not get text and audio files" + exit 1 +fi + +local/hub4_96_parse_sgm.pl $out/text.list > \ + $out/transcripts.txt 2> $out/parse_sgml.log || exit 1 + +if [ ! 
-s $out/transcripts.txt ]; then + echo "$0: Could not parse SGML files in $out/text.list" + exit 1 +fi + +echo "$0: 1996 English Broadcast News training data (HUB4) prepared in $out" +exit 0 diff --git a/egs/multi_en/s5/local/hub4_96_parse_sgm.pl b/egs/multi_en/s5/local/hub4_96_parse_sgm.pl new file mode 100755 index 00000000000..172ec5bb563 --- /dev/null +++ b/egs/multi_en/s5/local/hub4_96_parse_sgm.pl @@ -0,0 +1,235 @@ +#!/usr/bin/env perl +########################################################################################### +# This script was copied from egs/hub4_english/s5/local/data_prep/parse_sgm_1996_hub4_eng.pl +# The source commit was 9f61a1b0efa76f37fc29fa2dbeede6dc776a0203 +# No change was made +########################################################################################### + +#=============================================================================== +# Copyright (c) 2017 Johns Hopkins University (Author: Jan "Yenda" Trmal ) +# 2017 Vimal Manohar +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + +require HTML::Parser or die "This script needs HTML::Parser from CPAN"; +HTML::Parser->import(); + +binmode(STDOUT, ":utf8"); + +sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s }; + +sub parse_sgml_tag { + my $tag = shift(@_); + my %ret; + + if ($tag !~ /=/) { + return %ret; + } + + $tag =~ s/<[a-zA-Z]+ //; + $tag =~ s/> *$//; + #print $tag . "\n"; + + my @key_value_pairs = split / *,? +/, $tag; + for my $entry(@key_value_pairs) { + (my $key, my $value) = split '=', $entry, 2; + $ret{$key}=$value; + } + return %ret; +} + +if (@ARGV != 1) { + print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n"; + print STDERR " Usage: $0 \n"; + print STDERR " where\n"; + print STDERR " is a file containing the official SGML format\n"; + print STDERR " transcripts. The files are parsed and the parsed representation\n"; + print STDERR " is dumped to STDOUT (one utterance + the additional data fields\n"; + print STDERR " per line (we dump all the fields, but not all fields are used\n"; + print STDERR " in the recipe).\n"; + die; +} +my $filelist=$ARGV[0]; + +my $p = HTML::Parser->new(); + +my @files=(); +open(F, '<', $filelist) or die "Could not open file $filelist: $?\n"; +while() { + chomp; + push @files, $_; +} + +foreach my $file (@files) { + my $reporter=""; + my $start = -1; + my $end = -1; + my $segment_start = -1; + my $segment_end = -1; + my $segment_speaker; + my $segment_fidelity = "XXX"; + my $segment_mode = "XXX"; + my $section_start = -1; + my $section_end = -1; + my $filename = ""; + my $seq = 0; + my @text = (); + my $time; + my @tagqueue; + + my $sgml_file = `basename $file`; + $sgml_file = trim $sgml_file; + $sgml_file =~ s/\.txt$//g; + $sgml_file =~ s/\.sgml$//g; + $sgml_file =~ s/_$//g; + + open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $?\n"; + + while(my $line = <$f>) 
{ + chomp $line; + $line = trim $line; + $line = lc $line; + next unless $line; + + if ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ /
/$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + + #print join(" ", @text) . "\n" if @text > 0; + my $new_time = $segment_end; + if (@text > 0) { + print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time "; + print join(" ", @text) . "\n"; + } + @text = (); + $time = 0; + $segment_speaker = "XXX"; + $segment_start = "XXX"; + $segment_end = "XXX"; + $segment_fidelity = "XXX"; + $segment_mode = "XXX"; + #print "ET: $line\n"; + ; + } elsif ($line =~ / 0) { + print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time "; + print join(" ", @text) . "\n"; + } + @text = (); + $time = $new_time; + ; + } elsif ($line =~ /<\/sync/) { + #print $line; + ; + } elsif ($line =~ /) +# 2017 Vimal Manohar +# License: Apache 2.0 + +# This script prepares the 1997 English Broadcast News (HUB4) corpus. +# /export/corpora/LDC/LDC98S71 +# /export/corpora/LDC/LDC98T28 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora/LDC/LDC98T28/hub4e97_trans_980217 /export/corpora/LDC/LDC98S71/97_eng_bns_hub4 data/local/data/train_bn97" + exit 1 +fi + +text_source_dir=$1 # /export/corpora/LDC/LDC98T28/hub4e97_trans_980217 +speech_source_dir=$2 # /export/corpora/LDC/LDC98S71/97_eng_bns_hub4 +out=$3 + +mkdir -p $out; + +ls $text_source_dir/transcrp/*.sgml > $out/text.list +ls $speech_source_dir/*.sph > $out/audio.list + +if [ ! -s $out/text.list ] || [ ! -s $out/audio.list ]; then + echo "$0: Could not get text and audio files" + exit 1 +fi + +local/hub4_97_parse_sgm.pl $out/text.list > \ + $out/transcripts.txt 2> $out/parse_sgml.log || exit 1 + +if [ ! 
-s $out/transcripts.txt ]; then + echo "$0: Could not parse SGML files in $out/text.list" + exit 1 +fi + +echo "$0: 1997 English Broadcast News training data (HUB4) prepared in $out" +exit 0 diff --git a/egs/multi_en/s5/local/hub4_97_parse_sgm.pl b/egs/multi_en/s5/local/hub4_97_parse_sgm.pl new file mode 100755 index 00000000000..da2344df7c7 --- /dev/null +++ b/egs/multi_en/s5/local/hub4_97_parse_sgm.pl @@ -0,0 +1,235 @@ +#!/usr/bin/env perl +########################################################################################### +# This script was copied from egs/hub4_english/s5/local/data_prep/parse_sgm_1997_hub4_eng.pl +# The source commit was 191ae0a6e5db19d316c82a78c746bcd56cc2d7da +# No change was made +########################################################################################### + +#!/usr/bin/env perl +#=============================================================================== +# Copyright (c) 2017 Johns Hopkins University (Author: Jan "Yenda" Trmal ) +# 2017 Vimal Manohar +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + +require HTML::Parser or die "This script needs HTML::Parser from CPAN"; +HTML::Parser->import(); + +binmode(STDOUT, ":utf8"); + +sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s }; + +sub parse_sgml_tag { + my $tag = shift(@_); + my %ret; + + if ($tag !~ /=/) { + return %ret; + } + + $tag =~ s/<[a-zA-Z]+ //; + $tag =~ s/> *$//; + #print $tag . "\n"; + + my @key_value_pairs = split / *,? +/, $tag; + for my $entry(@key_value_pairs) { + (my $key, my $value) = split '=', $entry, 2; + $ret{$key}=$value; + } + return %ret; +} + +if (@ARGV != 1) { + print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n"; + print STDERR " Usage: $0 \n"; + print STDERR " where\n"; + print STDERR " is a file containing the official SGML format\n"; + print STDERR " transcripts. The files are parsed and the parsed representation\n"; + print STDERR " is dumped to STDOUT (one utterance + the additional data fields\n"; + print STDERR " per line (we dump all the fields, but not all fields are used\n"; + print STDERR " in the recipe).\n"; + die; +} +my $filelist=$ARGV[0]; + +my $p = HTML::Parser->new(); + +my @files=(); +open(F, '<', $filelist) or die "Could not open file $filelist: $?\n"; +while() { + chomp; + push @files, $_; +} + +foreach my $file (@files) { + my $reporter=""; + my $start = -1; + my $end = -1; + my $segment_start = -1; + my $segment_end = -1; + my $segment_speaker; + my $segment_fidelity = "XXX"; + my $segment_mode = "XXX"; + my $section_start = -1; + my $section_end = -1; + my $filename = ""; + my $seq = 0; + my @text = (); + my $time; + my @tagqueue; + + my $sgml_file = `basename $file`; + $sgml_file = trim $sgml_file; + $sgml_file =~ s/\.txt$//g; + $sgml_file =~ s/\.sgml$//g; + $sgml_file =~ s/_$//g; + + open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $?\n"; + + while(my $line = <$f>) 
{ + chomp $line; + $line = trim $line; + $line = lc $line; + next unless $line; + + if ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ /
/$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + + #print join(" ", @text) . "\n" if @text > 0; + my $new_time = $segment_end; + if (@text > 0) { + print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time "; + print join(" ", @text) . "\n"; + } + @text = (); + $time = 0; + $segment_speaker = "XXX"; + $segment_start = "XXX"; + $segment_end = "XXX"; + $segment_fidelity = "XXX"; + $segment_mode = "XXX"; + #print "ET: $line\n"; + ; + } elsif ($line =~ /