Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions src/nnet3/natural-gradient-online.cc
Original file line number Diff line number Diff line change
Expand Up @@ -119,26 +119,26 @@ void OnlineNaturalGradient::InitDefault(int32 D) {
t_ = 0;
}

void OnlineNaturalGradient::Init(const CuMatrixBase<BaseFloat> &R0) {
int32 D = R0.NumCols();
void OnlineNaturalGradient::Init(const CuMatrixBase<BaseFloat> &X0) {
int32 D = X0.NumCols();
// for locking reasons it's better to use a different object.
OnlineNaturalGradient this_copy(*this);
this_copy.InitDefault(D);
this_copy.t_ = 1; // Prevent recursion to Init() again.

CuMatrix<BaseFloat> R0_copy(R0.NumRows(), R0.NumCols(), kUndefined);
CuMatrix<BaseFloat> X0_copy(X0.NumRows(), X0.NumCols(), kUndefined);
// 'num_init_iters' is the number of iterations with the same data from a
// pseudorandom start. This is a faster way of starting than doing an
// eigenvalue decomposition.
//
// Note: we only do three iterations of initialization if we have enough data
// that it's reasonably possible to estimate the subspace of dimension
// this_copy.rank_. If we don't have more than that many rows in our initial
// minibatch R0, we just do one iteration... this gives us almost exactly
// (barring small effects due to epsilon_ > 0) the row subspace of R0 after
// minibatch X0, we just do one iteration... this gives us almost exactly
// (barring small effects due to epsilon_ > 0) the row subspace of X0 after
// one iteration anyway.
int32 num_init_iters;
if (R0.NumRows() <= this_copy.rank_)
if (X0.NumRows() <= this_copy.rank_)
num_init_iters = 1;
else
num_init_iters = 3;
Expand All @@ -147,8 +147,8 @@ void OnlineNaturalGradient::Init(const CuMatrixBase<BaseFloat> &R0) {
// initialize.
for (int32 i = 0; i < num_init_iters; i++) {
BaseFloat scale;
R0_copy.CopyFromMat(R0);
this_copy.PreconditionDirections(&R0_copy, &scale);
X0_copy.CopyFromMat(X0);
this_copy.PreconditionDirections(&X0_copy, &scale);
}
rank_ = this_copy.rank_;
W_t_.Swap(&this_copy.W_t_);
Expand Down Expand Up @@ -197,7 +197,7 @@ void OnlineNaturalGradient::PreconditionDirections(
t_ += 1;
}

void OnlineNaturalGradient::ReorthogonalizeXt1(
void OnlineNaturalGradient::ReorthogonalizeRt1(
const VectorBase<BaseFloat> &d_t1,
BaseFloat rho_t1,
CuMatrixBase<BaseFloat> *W_t1,
Expand All @@ -214,7 +214,7 @@ void OnlineNaturalGradient::ReorthogonalizeXt1(
ComputeEt(d_t1, beta_t1, &e_t1, &sqrt_e_t1, &inv_sqrt_e_t1);

temp_O->SymAddMat2(1.0, *W_t1, kNoTrans, 0.0);
// O_t = E_t^{-0.5} W_t W_t^T E_t^{-0.5}
// O_{t+1} = E_{t+1}^{-0.5} W_{t+1} W_{t+1}^T E_{t+1}^{-0.5}
Matrix<BaseFloat> O_mat(*temp_O);
SpMatrix<BaseFloat> O(O_mat, kTakeLower);
for (int32 i = 0; i < R; i++) {
Expand Down Expand Up @@ -439,7 +439,7 @@ void OnlineNaturalGradient::PreconditionDirectionsInternal(
if (self_debug_) {
KALDI_WARN << "Reorthogonalizing.";
}
ReorthogonalizeXt1(d_t1,
ReorthogonalizeRt1(d_t1,
rho_t1,
&W_t1,
&J_t,
Expand Down Expand Up @@ -510,7 +510,7 @@ void OnlineNaturalGradient::ComputeWt1(int32 N,
// B_t = J_t + (1-\eta)/(\eta/N) (D_t + \rho_t I) W_t
J_t->AddDiagVecMat(1.0, w_t_coeff_gpu, W_t, kNoTrans, 1.0);

// A_t = (\eta/N) E_{t+1}^{0.5} C_t^{-0.5} U_t^T E_t^{-0.5} B_t
// A_t = (\eta/N) E_{t+1}^{0.5} C_t^{-0.5} U_t^T E_t^{-0.5}
Matrix<BaseFloat> A_t(U_t, kTrans);
for (int32 i = 0; i < R; i++) {
BaseFloat i_factor = (eta / N) * sqrt_e_t1(i) * inv_sqrt_c_t(i);
Expand Down
8 changes: 4 additions & 4 deletions src/nnet3/natural-gradient-online.h
Original file line number Diff line number Diff line change
Expand Up @@ -375,8 +375,8 @@ namespace nnet3 {
* Initialization *

Now, a note on what we do at time t = 0, i.e. for the first minibatch. We
initialize X_0 to the top R eigenvectors of 1/N X_0 X_0^T, where N is the
minibatch size (num-rows of R0). If L is the corresponding RxR diagonal
initialize R_0 to the top R eigenvectors of 1/N X_0 X_0^T, where N is the
minibatch size (num-rows of X0). If L is the corresponding RxR diagonal
matrix of eigenvalues, then we will set D_0 = L - \rho_0 I. We set \rho_0
to ensure that
tr(F_0) = 1/N tr(X_0 X_0^T),
Expand Down Expand Up @@ -457,7 +457,7 @@ class OnlineNaturalGradient {
not.

*/
void PreconditionDirections(CuMatrixBase<BaseFloat> *R,
void PreconditionDirections(CuMatrixBase<BaseFloat> *X,
BaseFloat *scale);


Expand Down Expand Up @@ -515,7 +515,7 @@ class OnlineNaturalGradient {
// This function is called if C_t has a high condition number; it makes sure
// that R_{t+1} is orthogonal. See the section in the extended comment above
// on "keeping R_t orthogonal".
void ReorthogonalizeXt1(const VectorBase<BaseFloat> &d_t1,
void ReorthogonalizeRt1(const VectorBase<BaseFloat> &d_t1,
BaseFloat rho_t1,
CuMatrixBase<BaseFloat> *W_t1,
CuMatrixBase<BaseFloat> *temp_W,
Expand Down