diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc index 0677e1ca474..a205490ee3f 100644 --- a/src/nnet3/natural-gradient-online.cc +++ b/src/nnet3/natural-gradient-online.cc @@ -119,14 +119,14 @@ void OnlineNaturalGradient::InitDefault(int32 D) { t_ = 0; } -void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { - int32 D = R0.NumCols(); +void OnlineNaturalGradient::Init(const CuMatrixBase &X0) { + int32 D = X0.NumCols(); // for locking reasons it's better to use a different object. OnlineNaturalGradient this_copy(*this); this_copy.InitDefault(D); this_copy.t_ = 1; // Prevent recursion to Init() again. - CuMatrix R0_copy(R0.NumRows(), R0.NumCols(), kUndefined); + CuMatrix X0_copy(X0.NumRows(), X0.NumCols(), kUndefined); // 'num_iters' is number of iterations with the same data from a pseudorandom // start. this is a faster way of starting than doing eigenvalue // decomposition. @@ -134,11 +134,11 @@ void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { // Note: we only do three iterations of initialization if we have enough data // that it's reasonably possible to estimate the subspace of dimension // this_copy.rank_. If we don't have more than that many rows in our initial - // minibatch R0, we just do one iteration... this gives us almost exactly - // (barring small effects due to epsilon_ > 0) the row subspace of R0 after + // minibatch X0, we just do one iteration... this gives us almost exactly + // (barring small effects due to epsilon_ > 0) the row subspace of X0 after // one iteration anyway. int32 num_init_iters; - if (R0.NumRows() <= this_copy.rank_) + if (X0.NumRows() <= this_copy.rank_) num_init_iters = 1; else num_init_iters = 3; @@ -147,8 +147,8 @@ void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { // initialize. 
for (int32 i = 0; i < num_init_iters; i++) { BaseFloat scale; - R0_copy.CopyFromMat(R0); - this_copy.PreconditionDirections(&R0_copy, &scale); + X0_copy.CopyFromMat(X0); + this_copy.PreconditionDirections(&X0_copy, &scale); } rank_ = this_copy.rank_; W_t_.Swap(&this_copy.W_t_); @@ -197,7 +197,7 @@ void OnlineNaturalGradient::PreconditionDirections( t_ += 1; } -void OnlineNaturalGradient::ReorthogonalizeXt1( +void OnlineNaturalGradient::ReorthogonalizeRt1( const VectorBase &d_t1, BaseFloat rho_t1, CuMatrixBase *W_t1, @@ -214,7 +214,7 @@ void OnlineNaturalGradient::ReorthogonalizeXt1( ComputeEt(d_t1, beta_t1, &e_t1, &sqrt_e_t1, &inv_sqrt_e_t1); temp_O->SymAddMat2(1.0, *W_t1, kNoTrans, 0.0); - // O_t = E_t^{-0.5} W_t W_t^T E_t^{-0.5} + // O_{t+1} = E_{t+1}^{-0.5} W_{t+1} W_{t+1}^T E_{t+1}^{-0.5} Matrix O_mat(*temp_O); SpMatrix O(O_mat, kTakeLower); for (int32 i = 0; i < R; i++) { @@ -439,7 +439,7 @@ void OnlineNaturalGradient::PreconditionDirectionsInternal( if (self_debug_) { KALDI_WARN << "Reorthogonalizing."; } - ReorthogonalizeXt1(d_t1, + ReorthogonalizeRt1(d_t1, rho_t1, &W_t1, &J_t, @@ -510,7 +510,7 @@ void OnlineNaturalGradient::ComputeWt1(int32 N, // B_t = J_t + (1-\eta)/(\eta/N) (D_t + \rho_t I) W_t J_t->AddDiagVecMat(1.0, w_t_coeff_gpu, W_t, kNoTrans, 1.0); - // A_t = (\eta/N) E_{t+1}^{0.5} C_t^{-0.5} U_t^T E_t^{-0.5} B_t + // A_t = (\eta/N) E_{t+1}^{0.5} C_t^{-0.5} U_t^T E_t^{-0.5} Matrix A_t(U_t, kTrans); for (int32 i = 0; i < R; i++) { BaseFloat i_factor = (eta / N) * sqrt_e_t1(i) * inv_sqrt_c_t(i); diff --git a/src/nnet3/natural-gradient-online.h b/src/nnet3/natural-gradient-online.h index a68ad9bbb53..77be28a19d4 100644 --- a/src/nnet3/natural-gradient-online.h +++ b/src/nnet3/natural-gradient-online.h @@ -375,8 +375,8 @@ namespace nnet3 { * Initialization * Now, a note on what we do on time t = 0, i.e. for the first minibatch. We - initialize X_0 to the top R eigenvectors of 1/N X_0 X_0^T, where N is the - minibatch size (num-rows of R0). 
If L is the corresponding RxR diagonal + initialize R_0 to the top R eigenvectors of 1/N X_0^T X_0, where N is the + minibatch size (num-rows of X0). If L is the corresponding RxR diagonal matrix of eigenvalues, then we will set D_0 = L - \rho_0 I. We set \rho_0 to ensure that tr(F_0) = 1/N tr(X_0 X_0^T), @@ -457,7 +457,7 @@ class OnlineNaturalGradient { not. */ - void PreconditionDirections(CuMatrixBase *R, + void PreconditionDirections(CuMatrixBase *X, BaseFloat *scale); @@ -515,7 +515,7 @@ class OnlineNaturalGradient { // This function is called if C_t has high condition number; it makes sure // that R_{t+1} is orthogonal. See the section in the extended comment above // on "keeping R_t orthogonal". - void ReorthogonalizeXt1(const VectorBase &d_t1, + void ReorthogonalizeRt1(const VectorBase &d_t1, BaseFloat rho_t1, CuMatrixBase *W_t1, CuMatrixBase *temp_W,