Xvector #5
Changes from all commits
2fa92eb
025150a
0373c07
adf6a94
6e67582
@@ -2,6 +2,7 @@
// Copyright 2009-2012  Karel Vesely
//                      Johns Hopkins University (author: Daniel Povey)
//                2016  David Snyder
// See ../../COPYING for clarification regarding multiple authors
//
@@ -29,15 +30,15 @@ namespace kaldi {
namespace cu {

/*
 * templated functions wrapping the ANSI-C CUDA kernel functions
 */

template<typename Real>
void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1, Real lr) {
  KALDI_ASSERT(SameDim(*weight, *grad));
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;

    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
@@ -46,7 +47,7 @@ void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1,
    cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr,
                       weight->Dim(), grad->Stride());
    CU_SAFE_CALL(cudaGetLastError());

    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
#endif
@@ -55,11 +56,11 @@ void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1,
    MatrixBase<Real> &grad2 = grad->Mat();
    for(MatrixIndexT r=0; r<weight2.NumRows(); r++) {
      for(MatrixIndexT c=0; c<weight2.NumCols(); c++) {

        if(weight2(r,c)==0.0) continue; // skip L1 if zero weight!

        Real l1_signed = l1;
        if (weight2(r, c) < 0.0)
          l1_signed = -l1;

        Real before = weight2(r, c);
@@ -88,16 +89,16 @@ void Randomize(const CuMatrixBase<Real> &src,
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;

    /*
    Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535
    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
    dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK));
    */

    /*
     * Let's use blocksize 4 x 128 (512 threads/block)
     * and extend the randomizable matrices to: col 4*65535, row 128*65535
     * (ie. max-cols:262140 (dim), max-rows:8388480 (datapoints))
     */
    dim3 dimBlock(4, 128);

@@ -111,7 +112,7 @@ void Randomize(const CuMatrixBase<Real> &src,
    cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(),
                   copy_from_idx.Data(), dimtgt, dimsrc);
    CU_SAFE_CALL(cudaGetLastError());

    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
#endif
@@ -124,28 +125,28 @@ void Randomize(const CuMatrixBase<Real> &src,
      tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i]));
    }
  }
}


template<typename Real>
void Splice(const CuMatrixBase<Real> &src, const CuArray<int32> &frame_offsets,
            CuMatrixBase<Real> *tgt) {

  KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols());
  KALDI_ASSERT(src.NumRows() == tgt->NumRows());

#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;

    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
    dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));

    cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(),
                frame_offsets.Data(), tgt->Dim(), src.Dim());
    CU_SAFE_CALL(cudaGetLastError());

    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
#endif
@@ -171,22 +172,22 @@ void Splice(const CuMatrixBase<Real> &src, const CuArray<int32> &frame_offsets,

template<typename Real>
void Copy(const CuMatrixBase<Real> &src, const CuArray<int32> &copy_from_indices,
          CuMatrixBase<Real> *tgt) {

  KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols());
  KALDI_ASSERT(src.NumRows() == tgt->NumRows());

#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;

    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
    dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));

    cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(),
              copy_from_indices.Data(), tgt->Dim(), src.Dim());
    CU_SAFE_CALL(cudaGetLastError());

    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
#endif
@@ -205,6 +206,31 @@ void Copy(const CuMatrixBase<Real> &src, const CuArray<int32> &copy_from_indices,
  }
}

template<typename Real>
void ComputeXvectorObjfFromScores(const CuMatrixBase<Real> &scores,
                                  CuMatrixBase<Real> *objf_terms,
                                  CuMatrixBase<Real> *objf_derivs) {
#if HAVE_CUDA == 1
Owner comment: check the dimensions at the beginning of this function -- KALDI_ASSERT(SameDim(scores, *objf_terms) && ..)
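A sketch of how that check might read, assuming the intent is that all three matrices share the same dimensions (the comment only spells out the first clause):

  // Assumed completion of the suggested check; hypothetical, not part of the patch.
  KALDI_ASSERT(SameDim(scores, *objf_terms) && SameDim(scores, *objf_derivs));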
  if (CuDevice::Instantiate().Enabled()) {
    Timer tim;
    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
    dim3 dimGrid(n_blocks(scores.NumCols(), CU2DBLOCK),
                 n_blocks(scores.NumRows(), CU2DBLOCK));

    cuda_compute_xvector_objf(dimGrid, dimBlock, scores.Data(), scores.Dim(),
        objf_terms->Data(), objf_terms->Dim(), objf_derivs->Data(),
        objf_derivs->Dim());
    CU_SAFE_CALL(cudaGetLastError());

    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
  } else
#endif
  {
    // TODO: Add the CPU version.
    KALDI_LOG << "NOT USING CUDA";
  }
}

// instantiate the templates.
template
void RegularizeL1(CuMatrixBase<float> *weight, CuMatrixBase<float> *grad, float l1, float lr);

@@ -233,6 +259,15 @@ void Randomize(const CuMatrixBase<double> &src,
               const CuArray<int32> &copy_from_idx,
               CuMatrixBase<double> *tgt);

template
Owner comment: Since this is kind of a special-purpose function, you don't have to instantiate for both float and double -- you can just hard-code it to BaseFloat [i.e. not a template].
void ComputeXvectorObjfFromScores(const CuMatrixBase<float> &scores,
                                  CuMatrixBase<float> *objf_terms,
                                  CuMatrixBase<float> *objf_derivs);
template
void ComputeXvectorObjfFromScores(const CuMatrixBase<double> &scores,
                                  CuMatrixBase<double> *objf_terms,
                                  CuMatrixBase<double> *objf_derivs);

} //namespace cu
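Regarding the instantiation comment above, a sketch of the non-templated form that suggestion implies (hypothetical, not part of this patch), using Kaldi's BaseFloat typedef:

  // Hypothetical declaration following the reviewer's suggestion: drop the
  // template and hard-code BaseFloat (Kaldi's default floating-point type).
  void ComputeXvectorObjfFromScores(const CuMatrixBase<BaseFloat> &scores,
                                    CuMatrixBase<BaseFloat> *objf_terms,
                                    CuMatrixBase<BaseFloat> *objf_derivs);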
Comment: To avoid separately having to zero the upper triangle and the diagonal of the matrix, you might as well do it in this kernel [i.e. and set the matrix to kUndefined before calling this kernel].
However, I suppose this all becomes moot if you end up using Pegah's idea and relying on the SoftHinge kernel and a fixed scaling matrix.
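A rough sketch of the kind of kernel behavior being suggested, with hypothetical names (this is not the kernel in the patch): the same launch that fills the strict lower triangle also writes zeros to the diagonal and upper triangle, so the output matrix can be allocated with kUndefined.

  // Hypothetical illustration: one thread per (row, col) entry; entries on or
  // above the diagonal are zeroed in the same pass as the main computation.
  __global__ static void _example_lower_triangle_only(float *out, int rows,
                                                      int cols, int stride) {
    int c = blockIdx.x * blockDim.x + threadIdx.x;
    int r = blockIdx.y * blockDim.y + threadIdx.y;
    if (r >= rows || c >= cols) return;
    if (c >= r) {
      out[r * stride + c] = 0.0f;  // diagonal and upper triangle: just zero
    } else {
      out[r * stride + c] = 0.0f;  // placeholder for the real per-pair term
    }
  }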
Comment (david-ryan-snyder): After looking at it more, I think it's better to just do this in a CUDA kernel.
Also, I still need to make kernels for the actual derivatives, which are somewhat nontrivial to compute in an efficient way... I don't think it's possible to use Pegah's idea to handle them.
Comment (Dan): I think the only not-100%-trivial thing about the derivatives is the fact that different parts of the matrix have different scaling factors. You could probably compute the objf and derivs as follows, using individual kernels:
fixed-scaling-2 (with 1/(num-rows-2) for different-class members) to get the objf
are the derivatives of the objective function w.r.t. the raw scores.
There may be a few signs wrong here. However, it would be more efficient to do all of the above in a single kernel. You can easily do it in the same kernel as computes the objective-function terms [do summation via matrix-sum though].
Comment (david-ryan-snyder): I think you're describing an alternative way to get the coefficients for the derivative terms. But the kernel code above already does that.
On the CPU, the derivative w.r.t. S needs something like the following (NOTE: I'm ignoring peculiarities due to S being symmetric), where C() is a coefficient that depends on whether or not the vectors at row i and j are from the same or different classes. This is what we calculated in the kernel above.
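(The snippet being referred to is not reproduced here; based on the surrounding description, it was presumably an O(N^2) accumulation over pairs of rows, roughly like this hypothetical sketch, where X, S_deriv, and C are assumed names:)

  // Hypothetical sketch of the O(N^2) CPU-style accumulation being described:
  // for every pair of xvectors (rows of X), add a scaled sum of their outer
  // products into S_deriv.  C(i, j) depends on whether rows i and j belong to
  // the same class and already folds in the objf derivative w.r.t. the score.
  for (int32 i = 0; i < X.NumRows(); i++) {
    for (int32 j = 0; j < i; j++) {
      SubVector<BaseFloat> xi(X, i), xj(X, j);
      BaseFloat c = C(i, j);
      S_deriv.AddVecVec(c, xi, xi);  // adds c * x_i x_i'
      S_deriv.AddVecVec(c, xj, xj);  // adds c * x_j x_j'
    }
  }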
Each v,w pair results in its own matrix, which I think makes it harder to deal with in a single kernel. The easiest thing to do is probably to create an additional kernel that works like a modified form of matrix multiplication. Suppose V is the matrix of xvectors and D = NumCols(V). Then P = V' "times" V is the serialized outer product of each row of V. For example, P.Row(0) = Serialized( V.Row(0) * V.Row(0)' ). In other words, p_{i,j} = v_{i, (j / D) % D} * v_{i, j % D}.
Once that is done, it should be more straightforward to calculate S_deriv += C(i, j) * (P.Row(i) + P.Row(j)) in parallel.
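A sketch of what that "serialized outer product" kernel might look like (hypothetical, not part of the patch): P has NumRows(V) rows and D*D columns, and each thread fills one entry of P.

  // Hypothetical CUDA sketch of the proposed kernel: P.Row(i) is the
  // row-major serialization of the outer product V.Row(i) * V.Row(i)',
  // i.e. p(i, j) = v(i, j / D) * v(i, j % D).
  __global__ static void _serialized_outer_product(const float *v, int num_rows,
                                                   int d, int v_stride,
                                                   float *p, int p_stride) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;  // column of P, 0 <= j < d*d
    int i = blockIdx.y * blockDim.y + threadIdx.y;  // row of P (and of V)
    if (i >= num_rows || j >= d * d) return;
    p[i * p_stride + j] = v[i * v_stride + (j / d)] * v[i * v_stride + (j % d)];
  }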
Comment (Dan): I don't think you are really thinking about this in the spirit of backprop. The general principle is that you go forward computing the objective function, and then you do a process that is roughly the mirror-image of the forward process to backprop the derivatives through the computation.
What I described was getting the derivatives of the objective function w.r.t. the matrix of scores. After that you just have to do the reverse of the forward operations to get the derivatives w.r.t. S and the matrix of xvectors.
Comment (david-ryan-snyder): Right, that's what I'm referring to. Once you have the derivs of the objf w.r.t. the scores (included in C(i,j)), you still need to compute the derivative of the scores w.r.t. S. However, as far as I can tell, unless you do that in a kernel, you'll end up with an algorithm with two loops over the xvectors (see the pseudo-code in the earlier post). I proposed the kernel above to parallelize that computation.
Comment (Dan): OK, let me work this out...
The forward computation is something like:
A = X X'
cvec = diag(X S X')
u = vector of ones
scores = A - cvec u' - u cvec' + b
... compute the objf and get scores_deriv, which is d(objf)/d(scores)
A_deriv = scores_deriv
X_deriv += 2 A_deriv X (or something like that)
cvec_deriv = -(sum of scores_deriv columns) - (sum of scores_deriv rows)
When computing the deriv w.r.t. S, I am thinking about the expression cvec_deriv . cvec, which equals trace(diag(cvec_deriv) X S X'), where diag(cvec_deriv) is a matrix whose diagonal is cvec_deriv; we can rearrange this to trace(S (X' diag(cvec_deriv) X)). We get from this (through a mysterious process, I do it intuitively)
S_deriv = X' diag(cvec_deriv) X
which is pretty easy to compute.
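For completeness, the "mysterious process" in the last step is the standard trace identity for matrix derivatives; a worked version of that step, treating S as an unconstrained matrix and noting that X' diag(cvec_deriv) X is symmetric (so the symmetry of S does not change the answer):

% Linearized contribution of cvec to the objective, written as a trace and
% differentiated with d/dS tr(S M) = M', where M = X' diag(cvec_deriv) X = M'.
\mathrm{cvec\_deriv}\cdot\mathrm{cvec}
  = \operatorname{tr}\!\big(\operatorname{diag}(\mathrm{cvec\_deriv})\,X S X^{\top}\big)
  = \operatorname{tr}\!\big(S\,X^{\top}\operatorname{diag}(\mathrm{cvec\_deriv})\,X\big)
\;\Longrightarrow\;
S_{\mathrm{deriv}} = X^{\top}\operatorname{diag}(\mathrm{cvec\_deriv})\,X .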
Comment (david-ryan-snyder): OK, I'll play with it some more to see if I can get it to work without a kernel and without an O(N^2) computation.
In your procedure, it isn't obvious to me (yet) that you can get terms of the form S_deriv = C(x,y) * (x x' + y y') for all combinations of (x,y) pairs. That's where the O(N^2) comes from that I'm trying to avoid.
Comment (Dan): The fact that it was possible in the forward computation generally means it's possible in the backward computation.
You'll get S_deriv = X' diag(cvec_deriv) X, I think.