diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py
index 4a39ed9dae6..144bc879e51 100644
--- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py
+++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py
@@ -76,12 +76,10 @@ def prepare_initial_acoustic_model(dir, alidir, run_opts,
     common_train_lib.prepare_initial_network(dir, run_opts,
                                              srand=srand)
 
-    # Convert to .mdl, train the transitions, set the priors.
+    # Convert to .mdl, set the priors.
     common_lib.execute_command(
         """{command} {dir}/log/init_mdl.log \
-                nnet3-am-init {alidir}/final.mdl {raw_mdl} - \| \
-                nnet3-am-train-transitions - \
-                    "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl
+                nnet3-am-init {alidir}/final.mdl {raw_mdl} {dir}/0.mdl
         """.format(command=run_opts.command,
                    dir=dir, alidir=alidir,
                    raw_mdl=(input_model if input_model is not None
diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile
index e474e85378a..a39362b0af0 100644
--- a/src/nnet3/Makefile
+++ b/src/nnet3/Makefile
@@ -31,7 +31,8 @@ OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \
            nnet-compile-looped.o decodable-simple-looped.o \
            decodable-online-looped.o convolution.o \
            nnet-convolutional-component.o attention.o \
-           nnet-attention-component.o nnet-tdnn-component.o nnet-batch-compute.o
+           nnet-attention-component.o nnet-tdnn-component.o nnet-batch-compute.o \
+           get-feature-transform.o
 
 LIBNAME = kaldi-nnet3
 
@@ -41,6 +42,6 @@ ADDLIBS = ../chain/kaldi-chain.a ../cudamatrix/kaldi-cudamatrix.a \
           ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
           ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../cblasext/kaldi-cblasext.a \
-          ../base/kaldi-base.a 
+          ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/nnet3/get-feature-transform.cc b/src/nnet3/get-feature-transform.cc
new file mode 100644
index 00000000000..3eef63765fe
--- /dev/null
+++ b/src/nnet3/get-feature-transform.cc
@@ -0,0 +1,203 @@
+// nnet3/get-feature-transform.cc
+
+// Copyright 2009-2011  Jan Silovsky
+//                2013  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "nnet3/get-feature-transform.h"
+
+namespace kaldi {
+
+
+void FeatureTransformEstimate::Estimate(const FeatureTransformEstimateOptions &opts,
+                                        Matrix<BaseFloat> *M,
+                                        TpMatrix<BaseFloat> *C) const {
+  double count;
+  Vector<double> total_mean;
+  SpMatrix<double> total_covar, between_covar;
+  GetStats(&total_covar, &between_covar, &total_mean, &count);
+  KALDI_LOG << "Data count is " << count;
+  EstimateInternal(opts, total_covar, between_covar, total_mean, M, C);
+}
+
+// static
+void FeatureTransformEstimate::EstimateInternal(
+    const FeatureTransformEstimateOptions &opts,
+    const SpMatrix<double> &total_covar,
+    const SpMatrix<double> &between_covar,
+    const Vector<double> &total_mean,
+    Matrix<BaseFloat> *M,
+    TpMatrix<BaseFloat> *C) {
+
+  int32 target_dim = opts.dim, dim = total_covar.NumRows();
+  // Interpret zero or negative target_dim as the full dim.
+  if (target_dim <= 0)
+    target_dim = dim;
+  // between-class covar is of rank at most C-1.
+  KALDI_ASSERT(target_dim <= dim);
+
+  // within-class covariance
+  SpMatrix<double> wc_covar(total_covar);
+  wc_covar.AddSp(-1.0, between_covar);
+  TpMatrix<double> wc_covar_sqrt(dim);
+  try {
+    wc_covar_sqrt.Cholesky(wc_covar);
+    if (C != NULL) {
+      C->Resize(dim);
+      C->CopyFromTp(wc_covar_sqrt);
+    }
+  } catch (...) {
+    BaseFloat smooth = 1.0e-03 * wc_covar.Trace() / wc_covar.NumRows();
+    KALDI_LOG << "Cholesky failed (possibly not +ve definite), so adding " << smooth
+              << " to diagonal and trying again.\n";
+    for (int32 i = 0; i < wc_covar.NumRows(); i++)
+      wc_covar(i, i) += smooth;
+    wc_covar_sqrt.Cholesky(wc_covar);
+  }
+  Matrix<double> wc_covar_sqrt_mat(wc_covar_sqrt);
+  wc_covar_sqrt_mat.Invert();
+
+  SpMatrix<double> tmp_sp(dim);
+  tmp_sp.AddMat2Sp(1.0, wc_covar_sqrt_mat, kNoTrans, between_covar, 0.0);
+  Matrix<double> tmp_mat(tmp_sp);
+  Matrix<double> svd_u(dim, dim), svd_vt(dim, dim);
+  Vector<double> svd_d(dim);
+  tmp_mat.Svd(&svd_d, &svd_u, &svd_vt);
+  SortSvd(&svd_d, &svd_u);
+
+  KALDI_LOG << "LDA singular values are " << svd_d;
+
+  KALDI_LOG << "Sum of all singular values is " << svd_d.Sum();
+  KALDI_LOG << "Sum of selected singular values is " <<
+      SubVector<double>(svd_d, 0, target_dim).Sum();
+
+  Matrix<double> lda_mat(dim, dim);
+  lda_mat.AddMatMat(1.0, svd_u, kTrans, wc_covar_sqrt_mat, kNoTrans, 0.0);
+
+  // finally, copy first target_dim rows to M
+  M->Resize(target_dim, dim);
+  M->CopyFromMat(lda_mat.Range(0, target_dim, 0, dim));
+
+  if (opts.within_class_factor != 1.0) {
+    for (int32 i = 0; i < svd_d.Dim(); i++) {
+      BaseFloat old_var = 1.0 + svd_d(i),  // the total variance of that dim..
+          new_var = opts.within_class_factor + svd_d(i),  // the variance we want..
+          scale = sqrt(new_var / old_var);
+      if (i < M->NumRows())
+        M->Row(i).Scale(scale);
+    }
+  }
+
+  if (opts.max_singular_value > 0.0) {
+    int32 rows = M->NumRows(), cols = M->NumCols(),
+        min_dim = std::min(rows, cols);
+    Matrix<BaseFloat> U(rows, min_dim), Vt(min_dim, cols);
+    Vector<BaseFloat> s(min_dim);
+    M->Svd(&s, &U, &Vt);  // decompose M = U diag(s) Vt.
+    BaseFloat max_s = s.Max();
+    int32 n;
+    s.ApplyCeiling(opts.max_singular_value, &n);
+    if (n > 0) {
+      KALDI_LOG << "Applied ceiling to " << n << " out of " << s.Dim()
+                << " singular values of transform using ceiling "
+                << opts.max_singular_value << ", max is " << max_s;
+      Vt.MulRowsVec(s);
+      // reconstruct M with the modified singular values:
+      M->AddMatMat(1.0, U, kNoTrans, Vt, kNoTrans, 0.0);
+    }
+  }
+
+  if (opts.remove_offset)
+    AddMeanOffset(total_mean, M);
+}
+
+void FeatureTransformEstimateMulti::EstimateTransformPart(
+    const FeatureTransformEstimateOptions &opts,
+    const std::vector<int32> &indexes,
+    const SpMatrix<double> &total_covar,
+    const SpMatrix<double> &between_covar,
+    const Vector<double> &mean,
+    Matrix<BaseFloat> *M) const {
+
+  int32 full_dim = Dim(), proj_dim = indexes.size();
+  Matrix<double> transform(proj_dim, full_dim);  // projects from full to projected dim.
+  for (int32 i = 0; i < proj_dim; i++)
+    transform(i, indexes[i]) = 1.0;
+
+  SpMatrix<double> total_covar_proj(proj_dim), between_covar_proj(proj_dim);
+  Vector<double> mean_proj(proj_dim);
+  total_covar_proj.AddMat2Sp(1.0, transform, kNoTrans, total_covar, 0.0);
+  between_covar_proj.AddMat2Sp(1.0, transform, kNoTrans, between_covar, 0.0);
+  mean_proj.AddMatVec(1.0, transform, kNoTrans, mean, 0.0);
+
+  Matrix<BaseFloat> M_proj;
+  FeatureTransformEstimateOptions opts_tmp(opts);
+  opts_tmp.dim = proj_dim;
+  EstimateInternal(opts_tmp, total_covar_proj, between_covar_proj, mean_proj,
+                   &M_proj, NULL);
+  if (M_proj.NumCols() == proj_dim + 1) {  // Extend transform to add the extra "1"
+                                           // that we use to handle mean shifts..
+    transform.Resize(proj_dim + 1, full_dim + 1, kCopyData);
+    transform(proj_dim, full_dim) = 1.0;
+  }
+  M->Resize(proj_dim, transform.NumCols());
+  // Produce output..
+  M->AddMatMat(1.0, M_proj, kNoTrans, Matrix<BaseFloat>(transform),
+               kNoTrans, 0.0);
+}
+
+void FeatureTransformEstimateMulti::Estimate(
+    const FeatureTransformEstimateOptions &opts,
+    const std::vector<std::vector<int32> > &indexes,
+    Matrix<BaseFloat> *M) const {
+
+  int32 input_dim = Dim(), output_dim = 0, num_transforms = indexes.size();
+  for (int32 i = 0; i < num_transforms; i++) {  // some input-checking.
+    KALDI_ASSERT(indexes[i].size() > 0);
+    std::vector<int32> this_indexes(indexes[i]);
+    std::sort(this_indexes.begin(), this_indexes.end());
+    KALDI_ASSERT(IsSortedAndUniq(this_indexes));  // check for duplicates.
+    KALDI_ASSERT(this_indexes.front() >= 0);
+    KALDI_ASSERT(this_indexes.back() < input_dim);
+    output_dim += this_indexes.size();
+  }
+
+  int32 input_dim_ext = (opts.remove_offset ? input_dim + 1 : input_dim);
+  M->Resize(output_dim, input_dim_ext);
+
+  double count;
+  Vector<double> total_mean;
+  SpMatrix<double> total_covar, between_covar;
+  GetStats(&total_covar, &between_covar, &total_mean, &count);
+
+  int32 cur_output_index = 0;
+  for (int32 i = 0; i < num_transforms; i++) {
+    Matrix<BaseFloat> M_tmp;
+    EstimateTransformPart(opts, indexes[i], total_covar, between_covar,
+                          total_mean, &M_tmp);
+    int32 this_output_dim = indexes[i].size();
+    M->Range(cur_output_index, this_output_dim, 0, M->NumCols()).
+        CopyFromMat(M_tmp);
+    cur_output_index += this_output_dim;
+  }
+
+}
+
+
+}  // End of namespace kaldi
diff --git a/src/nnet3/get-feature-transform.h b/src/nnet3/get-feature-transform.h
new file mode 100644
index 00000000000..d3a52d55552
--- /dev/null
+++ b/src/nnet3/get-feature-transform.h
@@ -0,0 +1,179 @@
+// nnet3/get-feature-transform.h
+
+// Copyright 2009-2011  Jan Silovsky
+//                2013  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_NNET3_GET_FEATURE_TRANSFORM_H_
+#define KALDI_NNET3_GET_FEATURE_TRANSFORM_H_
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "matrix/matrix-lib.h"
+#include "transform/lda-estimate.h"
+
+namespace kaldi {
+
+/**
+   @file
+   This file is modified from transform/lda-estimate.h.
+   It contains a class intended to be used in preconditioning
+   data for neural network training.  See the documentation for class
+   FeatureTransformEstimate for more details.
+*/
+
+struct FeatureTransformEstimateOptions {
+  bool remove_offset;
+  int32 dim;
+  BaseFloat within_class_factor;
+  BaseFloat max_singular_value;
+  FeatureTransformEstimateOptions(): remove_offset(true), dim(-1),
+                                     within_class_factor(0.001),
+                                     max_singular_value(5.0) { }
+
+  void Register(OptionsItf *opts) {
+    opts->Register("remove-offset", &remove_offset, "If true, output an affine "
+                   "transform that makes the projected data mean equal to zero.");
+    opts->Register("dim", &dim, "Dimension to project to with LDA");
+    opts->Register("within-class-factor", &within_class_factor, "If 1.0, do "
+                   "conventional LDA where the within-class variance will be "
+                   "unit in the projected space.  May be set to less than 1.0, "
+                   "which scales the features to have less variance, particularly "
+                   "for dimensions where between-class variance is small. ");
+    opts->Register("max-singular-value", &max_singular_value, "If >0, maximum "
+                   "allowed singular value of final transform (they are floored "
+                   "to this)");
+  }
+};
+
+/**
+   Class for computing a feature transform used for preconditioning of the
+   training data in neural-networks.
+
+   By preconditioning here, all we really mean is an affine transform of the
+   input data-- say if we set up the classification as going from vectors x_i
+   to labels y_i, then this would be a linear transform on X, so we replace
+   x_i with x'_i = A x_i + b.  The statistics we use to obtain this transform
+   are the within-class and between-class variance statistics, and the global
+   data mean, that we would use to estimate LDA.  When designing this, we had
+   a few principles in mind:
+     - We want to remove the global mean of the input features (this is
+       well established, I think there is a paper by LeCun explaining why
+       this is a good thing).
+     - We would like the transform to make the training process roughly
+       invariant to linear transformations of the input features, meaning
+       that whatever linear transformation you apply prior to this transform,
+       it should 'undo' it.
+     - We want directions in which there is a lot of between-class variance
+       to be given a higher variance than directions that have mostly
+       within-class variance-- it has been our experience that these
+       'nuisance directions' will interfere with the training if they are
+       given too large a scaling.
+   It is essential to our method that the number of classes is higher than
+   the dimension of the input feature space, which is normal for speech
+   recognition tasks (~5000 > ~250).
+
+   Basically our method is as follows:
+
+     - First subtract the mean.
+     - Get the within-class and between-class stats, as for LDA.
+     - Normalize the space as for LDA, so that the within-class covariance
+       matrix is unit and the between-class covariance matrix is diagonalized.
+     - At this stage, if the user asked for dimension reduction then
+       reduce the dimension by taking out dimensions with least between-class
+       variance [note: the current scripts do not do this by default].
+     - Apply a transform that reduces the variance of dimensions
+       with low between-class variance, as we'll describe below.
+     - Finally, do an SVD of the resulting transform, A = U S V^T, apply a
+       maximum to the diagonal elements of the matrix S (e.g. 5.0), and
+       reconstruct A' = U S' V^T; this is the final transform.  The point of
+       this stage is to stop the transform from 'blowing up' any dimensions of
+       the space excessively; this stage was introduced in response to a
+       problem we encountered at one point, and I think normally not very many
+       dimensions of S end up getting floored.
+
+   We need to explain the step that applies the dimension-specific scaling,
+   which we described above as, "Apply a transform that reduces the variance
+   of dimensions with low between-class variance".  For a particular
+   dimension, let the between-class diagonal covariance element be \lambda_i,
+   and the within-class diagonal covariance is 1 at this point (since we
+   have normalized the within-class covariance to unity); hence, the total
+   variance is \lambda_i + 1.
+   Below, "within-class-factor" is a constant that we set by default to
+   0.001.  We scale the i'th dimension of the features by:
+
+      \f$ sqrt( (within-class-factor + \lambda_i) / (1 + \lambda_i) ) \f$
+
+   If \lambda_i >> 1, this scaling factor approaches 1 (we don't need to
+   scale up dimensions with high between-class variance as they already
+   naturally have a higher variance than other dimensions).  As \lambda_i
+   becomes small, this scaling factor approaches sqrt(within-class-factor),
+   so dimensions with very small between-class variance get assigned a small
+   variance equal to within-class-factor, and for dimensions with
+   intermediate between-class variance, they end up with a variance roughly
+   equal to \lambda_i: consider that the variance was originally (1 +
+   \lambda_i), so by scaling the features by approximately sqrt((\lambda_i) /
+   (1 + \lambda_i)), the variance becomes approximately \lambda_i [this is
+   clear after noting that the variance gets scaled by the square of the
+   feature scale].
+ */
+class FeatureTransformEstimate: public LdaEstimate {
+ public:
+  /// Estimates the LDA transform matrix m.  If Mfull != NULL, it also outputs
+  /// the full matrix (without dimensionality reduction), which is useful for
+  /// some purposes.
+  /// If opts.remove_offset == true, it will output both matrices
+  /// with an extra column which corresponds to mean-offset removal (the matrix
+  /// should be multiplied by the feature with a 1 appended to give the correct
+  /// result, as with other Kaldi transforms.)
+  /// "within_cholesky" is a pointer to a TpMatrix that, if non-NULL, will
+  /// be set to the Cholesky factor of the within-class covariance matrix.
+  /// This is used for perturbing features.
+  void Estimate(const FeatureTransformEstimateOptions &opts,
+                Matrix<BaseFloat> *M,
+                TpMatrix<BaseFloat> *within_cholesky) const;
+ protected:
+  static void EstimateInternal(const FeatureTransformEstimateOptions &opts,
+                               const SpMatrix<double> &total_covar,
+                               const SpMatrix<double> &between_covar,
+                               const Vector<double> &mean,
+                               Matrix<BaseFloat> *M,
+                               TpMatrix<BaseFloat> *C);
+};
+
+
+class FeatureTransformEstimateMulti: public FeatureTransformEstimate {
+ public:
+  /// This is as FeatureTransformEstimate, but for use in
+  /// nnet-get-feature-transform-multi.cc, see the usage message
+  /// of that program for a description of what it does.
+  void Estimate(const FeatureTransformEstimateOptions &opts,
+                const std::vector<std::vector<int32> > &indexes,
+                Matrix<BaseFloat> *M) const;
+
+ private:
+  void EstimateTransformPart(const FeatureTransformEstimateOptions &opts,
+                             const std::vector<int32> &indexes,
+                             const SpMatrix<double> &total_covar,
+                             const SpMatrix<double> &between_covar,
+                             const Vector<double> &mean,
+                             Matrix<BaseFloat> *M) const;
+};
+
+
+
+}  // End namespace kaldi
+
+#endif  // KALDI_NNET3_GET_FEATURE_TRANSFORM_H_
diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile
index 74d85efce1c..7212e480dbc 100644
--- a/src/nnet3bin/Makefile
+++ b/src/nnet3bin/Makefile
@@ -20,7 +20,7 @@ BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \
            nnet3-discriminative-compute-from-egs nnet3-latgen-faster-looped \
            nnet3-egs-augment-image nnet3-xvector-get-egs nnet3-xvector-compute \
            nnet3-latgen-grammar nnet3-compute-batch nnet3-latgen-faster-batch \
-           cuda-compiled
+           nnet-get-feature-transform cuda-compiled
 
 OBJFILES =
 
diff --git a/src/nnet3bin/cuda-compiled.cc b/src/nnet3bin/cuda-compiled.cc
index b6de9257657..50a36450412 100644
--- a/src/nnet3bin/cuda-compiled.cc
+++ b/src/nnet3bin/cuda-compiled.cc
@@ -1,4 +1,4 @@
-// nnet2bin/cuda-compiled.cc
+// nnet3bin/cuda-compiled.cc
 
 // Copyright 2014  Johns Hopkins University (author: Daniel Povey)
 
diff --git a/src/nnet3bin/nnet-get-feature-transform.cc b/src/nnet3bin/nnet-get-feature-transform.cc
new file mode 100644
index 00000000000..43bbaacbe94
--- /dev/null
+++ b/src/nnet3bin/nnet-get-feature-transform.cc
@@ -0,0 +1,85 @@
+// nnet3bin/nnet-get-feature-transform.cc
+
+// Copyright 2013  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "nnet3/get-feature-transform.h"
+
+int main(int argc, char *argv[]) {
+  using namespace kaldi;
+  typedef kaldi::int32 int32;
+  try {
+    const char *usage =
+        "Get feature-projection transform using stats obtained with acc-lda.\n"
+        "See comments in the code of nnet3/get-feature-transform.h for more\n"
+        "information.\n"
+        "\n"
+        "Usage:  nnet-get-feature-transform [options] <matrix-out> <lda-acc-1> <lda-acc-2> ...\n";
+
+    bool binary = true;
+    FeatureTransformEstimateOptions opts;
+    std::string write_cholesky;
+    std::string write_within_covar;
+    ParseOptions po(usage);
+    po.Register("binary", &binary, "Write outputs in binary mode.");
+    po.Register("write-cholesky", &write_cholesky, "If supplied, write to this "
+                "wxfilename the Cholesky factor of the within-class covariance. "
+                "Can be used for perturbing features.  E.g. "
+                "--write-cholesky=exp/nnet5/cholesky.tpmat");
+    po.Register("write-within-covar", &write_within_covar, "If supplied, write "
+                "to this wxfilename the within-class covariance (as a symmetric "
+                "matrix).  E.g. --write-within-covar=exp/nnet5/within_covar.mat");
+    opts.Register(&po);
+    po.Read(argc, argv);
+
+    if (po.NumArgs() < 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    FeatureTransformEstimate fte;
+    std::string projection_wxfilename = po.GetArg(1);
+
+    for (int32 i = 2; i <= po.NumArgs(); i++) {
+      bool binary_in, add = true;
+      Input ki(po.GetArg(i), &binary_in);
+      fte.Read(ki.Stream(), binary_in, add);
+    }
+
+    Matrix<BaseFloat> mat;
+    TpMatrix<BaseFloat> cholesky;
+    fte.Estimate(opts, &mat,
+                 (write_cholesky != "" || write_within_covar != "" ?
+                  &cholesky : NULL));
+    WriteKaldiObject(mat, projection_wxfilename, binary);
+    if (write_cholesky != "") {
+      WriteKaldiObject(cholesky, write_cholesky, binary);
+    }
+    if (write_within_covar != "") {
+      SpMatrix<BaseFloat> within_var(cholesky.NumRows());
+      within_var.AddTp2(1.0, cholesky, kNoTrans, 0.0);
+      WriteKaldiObject(within_var, write_within_covar, binary);
+    }
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
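
Note on the scaling step documented in get-feature-transform.h: with the default within-class-factor of 0.001, a dimension with between-class variance \lambda_i = 0.01 would be scaled by sqrt((0.001 + 0.01) / (1 + 0.01)), roughly 0.10, whereas a dimension with \lambda_i = 10 would be scaled by sqrt((0.001 + 10) / (1 + 10)), roughly 0.95. These particular \lambda values are illustrative only, not taken from the patch; they just show that near-nuisance directions are strongly attenuated while directions with high between-class variance are left essentially unchanged.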
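
For orientation, here is a sketch of how the new binary might be invoked once LDA statistics have been accumulated with acc-lda. The experiment-directory paths below are placeholders, not part of this patch; the options shown are the ones registered in nnet-get-feature-transform.cc and FeatureTransformEstimateOptions:

    # hypothetical paths; lda.*.acc are accumulator files written by acc-lda
    nnet-get-feature-transform --dim=250 \
        --write-cholesky=exp/nnet5/cholesky.tpmat \
        --write-within-covar=exp/nnet5/within_covar.mat \
        exp/nnet5/lda.mat exp/nnet5/lda.*.acc

If neither --write-cholesky nor --write-within-covar is supplied, only the transform matrix (the first argument) is written.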