This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

use (m, m) temp space
Fan committed Aug 14, 2019
1 parent 279e1ce commit da6d472
Showing 6 changed files with 59 additions and 62 deletions.
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated from afd4b3 to 8bd9d4
6 changes: 6 additions & 0 deletions src/operator/c_lapack_api.h
@@ -249,6 +249,10 @@ inline void flip(int m, int n, DType *b, int ldb, DType *a, int lda) {
#define MXNET_LAPACK_sgetrf LAPACKE_sgetrf
#define MXNET_LAPACK_dgetrf LAPACKE_dgetrf

// Internally A is factorized as U * L * VT, but (following the tech report) we want
// the factorization UT * L * V, so we pass ut in the u slot and v in the vt slot.
// We also have to allocate at least m - 1 DType elements of workspace, because the
// internal LAPACKE function needs them to store `superb` (see the MKL documentation).
#define MXNET_LAPACK_CWRAP_GESVD(prefix, dtype) \
inline int MXNET_LAPACK_##prefix##gesvd(int matrix_layout, int m, int n, dtype* ut, \
int ldut, dtype* s, dtype* v, int ldv, \
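As a concrete illustration of the two points in the comment above — the wrapper forwards its ut argument in LAPACKE's u slot and its v argument in the vt slot, and the workspace must cover the `superb` scratch that LAPACKE_?gesvd requires — here is a minimal standalone sketch of that calling pattern. It is not the macro's actual expansion; the job flags and array names are illustrative.

```cpp
#include <algorithm>
#include <vector>
#include <lapacke.h>  // or <mkl_lapacke.h> with MKL; assumed available

// SVD of a row-major m x n matrix a (m <= n) as a = ut * diag(s) * v.
// ut is m x m, s has m entries, v is m x n.
int gesvd_rowmajor_sketch(int m, int n, float* a, float* ut, float* s, float* v) {
  // LAPACKE computes a = u * diag(s) * vt, so the caller's ut goes into the u
  // slot and the caller's v into the vt slot -- exactly the swap described above.
  std::vector<float> superb(std::max(1, std::min(m, n) - 1));  // scratch required by LAPACKE_?gesvd
  return LAPACKE_sgesvd(LAPACK_ROW_MAJOR, 'S', 'S', m, n,
                        a, n,            // input matrix, leading dimension n
                        s,               // singular values
                        ut, m,           // "u" output -> ut
                        v, n,            // "vt" output -> v
                        superb.data());  // unconverged superdiagonal elements
}
```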
@@ -382,6 +386,8 @@ inline void flip(int m, int n, DType *b, int ldb, DType *a, int lda) {
MXNET_LAPACK_CWRAP_SYEVD(ssyevd, float)
MXNET_LAPACK_CWRAP_SYEVD(dsyevd, double)

// Note: Supports row-major format only. Internally, column-major is used, so all
// inputs/outputs are flipped and transposed. m and n are flipped as well.
#define MXNET_LAPACK_CWRAP_GESVD(func, dtype) \
inline int MXNET_LAPACK_##func(int matrix_layout, int m, int n, dtype* ut, \
int ldut, dtype* s, dtype* v, int ldv, \
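The note above relies on a standard identity: a row-major m x n buffer, read as a column-major matrix with the dimensions swapped, is exactly the transpose, which is why the wrapper can flip m and n and transpose the inputs/outputs before calling the column-major routine. A tiny self-contained check of that identity (illustrative only):

```cpp
#include <cassert>

// Element (i, j) of an m x n row-major matrix stored in buf.
inline float at_row_major(const float* buf, int n, int i, int j) { return buf[i * n + j]; }

// Element (i, j) of a column-major matrix with leading dimension ld stored in buf.
inline float at_col_major(const float* buf, int ld, int i, int j) { return buf[j * ld + i]; }

int main() {
  const int m = 2, n = 3;
  const float buf[m * n] = {1, 2, 3,
                            4, 5, 6};  // row-major 2 x 3 matrix
  // The same memory, read column-major as an n x m matrix, is the transpose.
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      assert(at_row_major(buf, n, i, j) == at_col_major(buf, n, j, i));
  return 0;
}
```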
3 changes: 1 addition & 2 deletions src/operator/linalg.h
@@ -195,8 +195,7 @@ int linalg_syevd_workspace_query(const Tensor<xpu, 2, DType>& A,

// CPU/GPU-versions of LAPACK function "gesvd". Please refer to the
// LAPACK documentation for further details.
// Note:
// - V is input and output parameter (overwritten by A)
// Note: V is an input and output parameter (it overwrites A)

template<typename xpu, typename DType>
void linalg_gesvd(const Tensor<xpu, 2, DType>& UT,
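The note above means a caller never passes A separately: the input matrix is staged in V's buffer and the routine replaces it with the right singular vectors. A hedged in-tree sketch of that usage on CPU — the argument order of linalg_gesvd is assumed from the macros in linalg_impl.h, so treat it as illustrative rather than the header's exact contract:

```cpp
#include <mshadow/tensor.h>
#include "operator/linalg.h"  // assumed include path inside the MXNet source tree

template <typename DType>
void svd_of_A(const mshadow::Tensor<mshadow::cpu, 2, DType>& A,   // m x n input (kept intact)
              const mshadow::Tensor<mshadow::cpu, 2, DType>& UT,  // m x m output
              const mshadow::Tensor<mshadow::cpu, 1, DType>& L,   // m singular values
              const mshadow::Tensor<mshadow::cpu, 2, DType>& V,   // m x n scratch/output
              const mshadow::Tensor<mshadow::cpu, 1, DType>& work,
              mshadow::Stream<mshadow::cpu>* s) {
  mshadow::Copy(V, A, s);           // stage the input in V; gesvd will overwrite it
  linalg_gesvd(UT, L, V, work, s);  // on return V holds the right factor of A
}
```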
13 changes: 2 additions & 11 deletions src/operator/linalg_impl.h
@@ -1262,25 +1262,16 @@ void linalg_gesvd<cpu, DType>(const Tensor<cpu, 2, DType>& UT, \
const Tensor<cpu, 1, DType>& work, \
Stream<cpu> *s) { \
check_gesvd(UT, L, V); \
DType lwork(0); \
MXNET_LAPACK_##fname(MXNET_LAPACK_ROW_MAJOR, V.size(0), V.size(1), \
UT.dptr_, UT.stride_, L.dptr_, V.dptr_, V.stride_, \
&lwork, -1); \
int lwork(work.size(0)); \
int ret(MXNET_LAPACK_##fname(MXNET_LAPACK_ROW_MAJOR, V.size(0), V.size(1), \
UT.dptr_, UT.stride_, L.dptr_, V.dptr_, V.stride_, \
work.dptr_, static_cast<int>(lwork))); \
work.dptr_, lwork)); \
CHECK_EQ(ret, 0) << #fname << " failed in lapack on cpu."; \
}

LINALG_CPU_GESVD(sgesvd, float)
LINALG_CPU_GESVD(dgesvd, double)

// Mangle temp storage requirements for DType and int into a single
// request as we can only allocate one temp space per operator. We
// partition this temp space into two chunks again when calling sseyvd.
// Returned is the number of elements of type DType that the temp space
// needs to accommodate. This also makes this function signature equivalent
// to the work space query on GPU.
#define LINALG_CPU_GESVD_WORKSPACE_QUERY(func, DType) \
template<> inline \
int linalg_gesvd_workspace_query<cpu, DType>(const Tensor<cpu, 2, DType>& UT, \
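The comment a few lines above describes a general trick: when an operator may request only one temp space, the DType and int requirements are folded into a single count of DType elements and the block is split apart again inside the kernel. A minimal sketch of that folding, with illustrative helper names that are not part of the macro:

```cpp
#include <cstddef>

// DType elements needed so one allocation can also hold int_elems ints.
template <typename DType>
size_t packed_workspace_elems(size_t dtype_elems, size_t int_elems) {
  size_t int_bytes = int_elems * sizeof(int);
  size_t ints_as_dtype = (int_bytes + sizeof(DType) - 1) / sizeof(DType);  // round up
  return dtype_elems + ints_as_dtype;
}

// Carve one allocation of packed_workspace_elems() DType elements back into
// its DType part and its int part.
template <typename DType>
void split_workspace(DType* base, size_t dtype_elems, DType** dtype_part, int** int_part) {
  *dtype_part = base;
  *int_part = reinterpret_cast<int*>(base + dtype_elems);
}
```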
94 changes: 48 additions & 46 deletions src/operator/numpy/linalg/np_gesvd-inl.h
@@ -61,8 +61,7 @@ struct GesvdVecSign {

// (UT, L, V) = gesvd(A) [singular value decomposition]
// - V can overwrite A
// - Needs workspace (both DType and int), size of which is determined by a
// workspace query
// - Needs workspace (DType), size of which is determined by a workspace query
struct gesvd {
template<typename xpu, typename DType>
static void op(const Tensor<xpu, 3, DType>& A,
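For reference, the convention stated above — (UT, L, V) = gesvd(A), with A of shape m x n, UT of shape m x m, L of length m, and V of shape m x n (m <= n is assumed here) — means A is recovered as UT * diag(L) * V. A small plain-loop check of that reconstruction, with the shapes as just listed:

```cpp
#include <vector>

// Rebuild A (m x n, row-major) from the gesvd outputs UT (m x m), L (m), V (m x n).
std::vector<float> reconstruct_a(int m, int n,
                                 const std::vector<float>& UT,
                                 const std::vector<float>& L,
                                 const std::vector<float>& V) {
  std::vector<float> A(static_cast<size_t>(m) * n, 0.0f);
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      for (int p = 0; p < m; ++p)
        A[i * n + j] += UT[i * m + p] * L[p] * V[p * n + j];  // (UT * diag(L) * V)[i][j]
  return A;
}
```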
@@ -126,6 +125,7 @@ MSHADOW_XINLINE double gesvd_back_helper_eps(double* X) {
return 1e-100;
}

// dA overwritten by L^-1 dA
struct GesvdBackHelper_dV {
template<typename DType>
MSHADOW_XINLINE static void Map(int k, int m, int n, DType* L, int ldl,
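The one-line comment above summarizes GesvdBackHelper_dV: each m x n slice of dA becomes L^-1 dA, i.e. row i is divided by the i-th singular value. The same operation on a single plain row-major matrix, for reference (not the kernel itself, which is batched):

```cpp
// dA <- diag(L)^{-1} * dA for one m x n row-major matrix.
void scale_rows_by_inverse_singular_values(int m, int n, const float* L, float* dA) {
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      dA[i * n + j] /= L[i];  // row i scaled by 1 / L[i]
}
```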
@@ -144,14 +144,18 @@ struct GesvdBackHelper_dV {
}
};

// X (square) overwritten by X L
// Y overwritten by the diagonal of X
struct GesvdBackHelper_G1 {
template<typename DType>
MSHADOW_XINLINE static void Map(int k, int m, int n, DType* X, int ldx,
DType* L, int ldl) {
DType* L, int ldl, DType* Y, int ldy) {
const int offl(k * ldl);
const int offy(k * ldy);
const int offx(k * m * ldx);
DType numer(0.0);
for (int i = 0; i < m; ++i) {
Y[offy + i] = X[offx + i * ldx + i];
for (int j = 0; j < m; ++j) {
numer = L[offl + j];
X[offx + i * ldx + j] *= numer;
@@ -164,16 +168,15 @@ struct GesvdBackHelper_G2 {
template<typename DType>
MSHADOW_XINLINE static void Map(int k, int m, int n, DType* X, int ldx,
DType* L, int ldl, DType* dL, int lddl,
DType* dA, int ldda, DType* V, int ldv) {
DType* Y, int ldy) {
const int offx(k * m * ldx);
const int offl(k * ldl);
const int offdl(k * lddl);
const int offda(k * m * ldda);
const int offv(k * m * ldv);
const int offy(k * ldy);
const DType eps(gesvd_back_helper_eps(X));
DType denom1(0.0), denom2(0.0), elem(0.0);

for (int i = 0; i < m - 1; ++i) {
for (int i = 0; i < m; ++i) {
for (int j = i + 1; j < m; ++j) {
denom1 = L[offl + i] - L[offl + j];
denom2 = L[offl + i] + L[offl + j];
@@ -183,14 +186,7 @@
X[offx + i * ldx + j] = elem * L[offl + j];
X[offx + j * ldx + i] = elem * L[offl + i];
}
}
for (int i = 0; i < m; ++i) {
elem = DType(0.0);
for (int j = 0; j < n; ++j) {
elem += dA[offda + i * ldda + j] * V[offv + i * ldv + j];
}
elem = -elem + dL[offdl + i];
X[offx + i * ldx + i] = elem;
X[offx + i * ldx + i] = -Y[offy + i] + dL[offdl + i];
}
}
};
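For reference, the two factors formed in GesvdBackHelper_G2 combine into the familiar singular-value gap term from SVD backpropagation; the eps guard from gesvd_back_helper_eps suggests they are used as a divisor, protected against nearly equal singular values, with the exact sign and weighting conventions following the tech report mentioned in c_lapack_api.h:

```latex
\frac{1}{(L_i - L_j)\,(L_i + L_j)} \;=\; \frac{1}{L_i^{2} - L_j^{2}}, \qquad i \neq j .
```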
@@ -204,41 +200,49 @@ struct gesvd_backward {
const Tensor<xpu, 2, DType>& L,
const Tensor<xpu, 3, DType>& V,
const Tensor<xpu, 3, DType>& dA,
const Tensor<xpu, 3, DType>& tempMs,
const Tensor<xpu, 3, DType>& tempMr,
const Tensor<xpu, 3, DType>& tempM,
const Tensor<xpu, 2, DType>& tempMd,
Stream<xpu>* s, const nnvm::NodeAttrs& attrs) {
// Backward of (UT, L, V) = gesvd(A)
using namespace mxnet_op;
if (dA.dptr_ != dV.dptr_) {
Copy(dA, dV, s);
}
// From here on, we work on dA only
int k = dA.size(0), m = dA.size(1), n = dA.size(2);

// Need temporary space, same shape as dUT
// invdV:
Kernel<GesvdBackHelper_dV, xpu>::Launch
(s, dA.size(0), dA.size(1), dA.size(2), L.dptr_, L.stride_, dA.dptr_, dA.stride_);
(s, k, m, n, L.dptr_, L.stride_, dA.dptr_, dA.stride_);

// G1:
// This copy is just to make sure there are no invalid values (NaN, infinity) in tempM
Copy(tempMs, dUT, s);
Copy(tempMr, dA, s);
gemm::op(dA, V, tempMs, DType(1.0), DType(0.0), false, true, s);
// This is just to make sure there are no invalid values (NaN, infinity) in tempM and tempMd
tempM.FlatTo1D() = 0;
tempMd.FlatTo1D() = 0;
gemm::op(dA, V, tempM, DType(1.0), DType(0.0), false, true, s);
Kernel<GesvdBackHelper_G1, xpu>::Launch
(s, dA.size(0), dA.size(1), dA.size(2), tempMs.dptr_, tempMs.stride_,
L.dptr_, L.stride_);
gemm::op(dUT, UT, tempMs, DType(1.0), DType(1.0), true, false, s);
(s, k, m, n, tempM.dptr_, tempM.stride_,
L.dptr_, L.stride_, tempMd.dptr_, tempMd.stride_);
gemm::op(dUT, UT, tempM, DType(1.0), DType(1.0), true, false, s);

// G2:
Kernel<GesvdBackHelper_G2, xpu>::Launch
(s, dA.size(0), dA.size(1), dA.size(2), tempMs.dptr_, tempMs.stride_,
L.dptr_, L.stride_, dL.dptr_, dL.stride_, dA.dptr_, dA.stride_,
V.dptr_, V.stride_);
(s, k, m, n, tempM.dptr_, tempM.stride_,
L.dptr_, L.stride_, dL.dptr_, dL.stride_,
tempMd.dptr_, tempMd.stride_);

// G3:
gemm::op(tempMs, V, dA, DType(1.0), DType(1.0), false, false, s);
gemm::op(UT, dA, tempMr, DType(1.0), DType(0.0), false, false, s);
Copy(dA, tempMr, s);
gemm::op(tempM, V, dA, DType(1.0), DType(1.0), false, false, s);
for (int i = 0; i < n; i += m) {
int ncols = n - i < m ? n - i : m;
Tensor<xpu, 3, DType> t = Tensor<xpu, 3, DType>(dA.dptr_ + i,
Shape3(k, m, ncols), dA.stride_, dA.stream_);
Tensor<xpu, 3, DType> out = Tensor<xpu, 3, DType>(tempM.dptr_,
Shape3(k, m, ncols), tempM.stride_, tempM.stream_);
gemm::op(UT, t, out, DType(1.0), DType(0.0), false, false, s);
Copy(t, out, s);
}
}
};
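The final loop in gesvd_backward is what makes an (m, m) scratch buffer sufficient: dA <- UT * dA is formed m columns at a time, each block accumulated into tempM and copied back before the next block is touched. The same blocking idea on plain row-major arrays, as a standalone sketch (the real code operates on batched mshadow tensors and honors their strides):

```cpp
#include <algorithm>
#include <cstring>
#include <vector>

// In-place B <- U * B with U of shape m x m and B of shape m x n (row-major),
// using only an m x m scratch buffer by processing m columns of B per step.
void inplace_left_multiply(int m, int n, const float* U, float* B) {
  std::vector<float> temp(static_cast<size_t>(m) * m);
  for (int col0 = 0; col0 < n; col0 += m) {
    const int ncols = std::min(m, n - col0);  // the last block may be narrower
    // temp(0:m, 0:ncols) = U * B(0:m, col0:col0+ncols); safe because this block
    // of B is only overwritten after it has been fully consumed.
    for (int i = 0; i < m; ++i) {
      for (int j = 0; j < ncols; ++j) {
        float acc = 0.0f;
        for (int p = 0; p < m; ++p) acc += U[i * m + p] * B[p * n + col0 + j];
        temp[i * m + j] = acc;
      }
    }
    for (int i = 0; i < m; ++i)  // write the finished block back into B
      std::memcpy(B + i * n + col0, temp.data() + i * m, ncols * sizeof(float));
  }
}
```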

@@ -258,23 +262,21 @@ void NumpyLaGesvdBackward(const nnvm::NodeAttrs& attrs,
}
MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
TBlob tspace(outputs[0]);
TBlob tempMs, tempMr;
TBlob tempM, tempMd;
int kmn = outputs[0].shape_.Size();
int kmm = inputs[0].shape_.Size();
int km = inputs[1].shape_.Size();
if (req[0] == kAddTo) {
Tensor<xpu, 1, OType> tempspace = ctx.requested[0]
.get_space_typed<xpu, 1, OType>(Shape1(2 * outputs[0].shape_.Size()), s);
tspace = TBlob(tempspace.Slice(0, outputs[0].shape_.Size()))
.reshape(outputs[0].shape_);
tempMs = TBlob(tempspace.Slice(outputs[0].shape_.Size(),
outputs[0].shape_.Size() + inputs[0].shape_.Size()))
.reshape(inputs[0].shape_);
tempMr = TBlob(tempspace.Slice(outputs[0].shape_.Size(),
2 * outputs[0].shape_.Size()))
.reshape(outputs[0].shape_);
.get_space_typed<xpu, 1, OType>(Shape1(kmn + kmm + km), s);
tspace = TBlob(tempspace.Slice(0, kmn)).reshape(outputs[0].shape_);
tempM = TBlob(tempspace.Slice(kmn, kmn + kmm)).reshape(inputs[0].shape_);
tempMd = TBlob(tempspace.Slice(kmn + kmm, kmn + kmm + km)).reshape(inputs[1].shape_);
} else {
Tensor<xpu, 1, OType> tempspace = ctx.requested[0]
.get_space_typed<xpu, 1, OType>(Shape1(outputs[0].shape_.Size()), s);
tempMs = TBlob(tempspace.Slice(0, inputs[0].shape_.Size())).reshape(inputs[0].shape_);
tempMr = TBlob(tempspace.Slice(0, outputs[0].shape_.Size())).reshape(outputs[0].shape_);
.get_space_typed<xpu, 1, OType>(Shape1(kmm + km), s);
tempM = TBlob(tempspace.Slice(0, kmm)).reshape(inputs[0].shape_);
tempMd = TBlob(tempspace.Slice(kmm, kmm + km)).reshape(inputs[1].shape_);
}
laop::op(inputs[0].FlatToKD<xpu, 3, OType>(s), // dUT
inputs[1].FlatToKD<xpu, 2, OType>(s), // dL
@@ -283,8 +285,8 @@ void NumpyLaGesvdBackward(const nnvm::NodeAttrs& attrs,
inputs[4].FlatToKD<xpu, 2, OType>(s), // L
inputs[5].FlatToKD<xpu, 3, OType>(s), // V
tspace.FlatToKD<xpu, 3, OType>(s), // dA
tempMs.FlatToKD<xpu, 3, OType>(s), // tempMs
tempMr.FlatToKD<xpu, 3, OType>(s), // tempMr
tempM.FlatToKD<xpu, 3, OType>(s), // tempM
tempMd.FlatToKD<xpu, 2, OType>(s), // tempMd
s, attrs);
if (req[0] == kAddTo) {
Tensor<xpu, 1, OType> out = outputs[0].FlatTo1D<xpu, OType>(s);
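One more piece of context for the workspace sizes above: when the gradient request is kAddTo, the result may not be written straight into the output, so an extra k*m*n slice (tspace) is carved out, the backward pass writes dA there, and the scratch is then added into the real output; with kWriteTo the output buffer itself is used and only the k*m*m and k*m slices are needed. A small sketch of that accumulate-via-scratch pattern with plain arrays (the kAddTo/kWriteTo handling itself is MXNet's standard request convention, assumed here):

```cpp
#include <vector>

enum class Req { kWriteTo, kAddTo };

// Produce a gradient of `len` values into `out`, honoring the write/accumulate request.
// compute(dst) stands in for the backward pass that fills dst with the gradient.
template <typename Compute>
void apply_gradient(Req req, float* out, int len, Compute compute) {
  if (req == Req::kAddTo) {
    std::vector<float> scratch(len);  // extra temp slice, as in the kAddTo branch above
    compute(scratch.data());
    for (int i = 0; i < len; ++i) out[i] += scratch[i];  // accumulate into the output
  } else {
    compute(out);                     // kWriteTo: compute directly into the output
  }
}
```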
3 changes: 1 addition & 2 deletions tests/python/unittest/test_numpy_op.py
@@ -878,8 +878,7 @@ def get_grad(UT, L, V):
for hybridize in [True, False]:
for dtype in dtypes:
for shape in shapes:
rtol = 1e-3
atol = 1e-3
rtol = atol = 0.01
test_svd = TestSVD()
if hybridize:
test_svd.hybridize()
