From ca6a5606f18a80c390312b008c341c9bf240d533 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 25 Jul 2014 23:58:55 +0000 Subject: [PATCH 001/147] add 1D pack/unpack --- mshadow/tensor_expr_ext.h | 195 +++++++++++++++++++++++++++++++++++--- 1 file changed, 183 insertions(+), 12 deletions(-) diff --git a/mshadow/tensor_expr_ext.h b/mshadow/tensor_expr_ext.h index 8399b1b7a26b..6184d5be15f4 100644 --- a/mshadow/tensor_expr_ext.h +++ b/mshadow/tensor_expr_ext.h @@ -90,6 +90,65 @@ namespace mshadow{ } }; + /*! + * \brief unpack local (overlap) windows of a series to column of mat, can be used to implement 1D convolution, + * this expression allow unpack of a batch this is a version support unpacking multiple series + * after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations: + * \tparam SrcExp source expression + * \tparam dstdim destination dimension + */ + template + struct UnpackWindowToColExp: public MakeTensorExp< UnpackWindowToColExp, SrcExp, 2>{ + /*! \brief source operand */ + const SrcExp& series_; + /*! \brief patch size */ + index_t psize_; + /*! \brief patch stride */ + index_t pstride_; + /*! \brief number of input channel */ + index_t i_channel_; + /*! \brief length of img */ + index_t i_length_; + + /*! \brief constructor */ + UnpackWindowToColExp( const SrcExp series, index_t psize, index_t pstride ) + :series_(series), psize_(psize), pstride_(pstride){ + Shape ishape = ShapeCheck::Check( series_ ); + utils::Assert( ishape[0] >= psize , "UnpackWindowToCol:series length smaller than patch size"); + this->i_channel_ = ishape[2] * ishape[1]; + this->i_length_ = ishape[0]; + // calculate number of batches + const index_t num = ishape.ProdShape( 3, srcdim ); + const index_t o_length = ( i_length_ - psize ) / pstride + 1; + this->shape_[0] = o_length * num; + this->shape_[1] = psize * i_channel_; + } + }; + + /*! + * \brief reverse operation of UnpackWindowToCol, used to backprop gradient back + * this is a version supporting multiple images + * \tparam Device which device it lies + * \tparam dstdim destination dimension + */ + template + struct PackColToWindowExp: public MakeTensorExp< PackColToWindowExp, Tensor, dstdim>{ + /*! \brief source operand */ + const Tensor& mat_; + /*! \brief patch size */ + index_t psize_; + /*! \brief patch stride */ + index_t pstride_; + /*! \brief constructor */ + PackColToWindowExp( const Tensor &mat, Shape ishape, index_t psize, index_t pstride ) + :mat_(mat), psize_(psize), pstride_(pstride){ + this->shape_ = ishape; + const index_t o_length = ( ishape[0] - psize ) / pstride + 1; + utils::Assert( mat.shape[0] == o_length * ishape.ProdShape(3,dstdim), "PackColToWindowExp: mat.shape[0] mismatch" ); + utils::Assert( mat.shape[1] == psize * ishape[1] * ishape[2], "PackColToWindowExp: mat.shape[1] mismatch" ); + } + }; + /*! * \brief reshape the content to another shape * input: Tensor: ishape @@ -234,20 +293,22 @@ namespace mshadow{ struct PaddingExp : public MakeTensorExp, SrcExp, srcdim> { /*! \brief source operand */ const SrcExp& src_; - /*! \brief pad size */ - index_t pad_; + /*! \brief pad size in y */ + index_t pad_y_; + /*! \brief pad size in x */ + index_t pad_x_; /*! \brief source tensor height */ index_t src_height_; /*! \brief source tensor width */ index_t src_width_; /*! 
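The window-to-column expression above is the 1D analogue of im2col: once the overlapping windows are laid out as columns, convolution collapses into the single matrix product output = dot( weight, mat ) described in the comment. A minimal standalone sketch of the same indexing, assuming one series with a single channel stored in a plain array (the function name is illustrative, not part of mshadow):

    #include <cstddef>
    #include <vector>

    // unpack overlapping windows of a 1-channel series into a psize x o_length
    // matrix, so that 1D convolution becomes one GEMM: out = weight * mat
    std::vector< std::vector<float> > unpack_window(const std::vector<float> &series,
                                                    std::size_t psize, std::size_t pstride) {
      const std::size_t o_length = (series.size() - psize) / pstride + 1;
      std::vector< std::vector<float> > mat(psize, std::vector<float>(o_length));
      for (std::size_t i = 0; i < psize; ++i) {       // offset inside the window
        for (std::size_t j = 0; j < o_length; ++j) {  // which window
          mat[i][j] = series[j * pstride + i];
        }
      }
      return mat;
    }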
\brief constructor */ - PaddingExp( const SrcExp &src, index_t pad ) - : src_(src), pad_(pad) { + PaddingExp( const SrcExp &src, index_t pad_y, index_t pad_x ) + : src_(src), pad_y_(pad_y), pad_x_(pad_x) { this->shape_ = ShapeCheck::Check( src_ ); src_height_ = this->shape_[1]; src_width_ = this->shape_[0]; - this->shape_[1] += pad * 2; // height - this->shape_[0] += pad * 2; // width + this->shape_[1] += pad_y * 2; // height + this->shape_[0] += pad_x * 2; // width } }; @@ -393,6 +454,44 @@ namespace mshadow{ utils::Assert( imshape[0] >= psize && imshape[1] >= psize, "PackColToPatch:image shape smaller than patch size"); return PackColToPatchXExp( mat, imshape, psize, pstride ); } + + + /*! + * \brief unpack local (overlap) windows of time series to column of mat, can be used to implement 1D convolution + * after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations: + * + * weight; shape[1]: out_channel, shape[0]: ichannel*psize + * output; shape[1]: out_channel, shape[0]: out_length * num_of_series + * out_length = ( in_length - psize ) / pstride + 1 + * + * \return mat target matrix; shape[1]: in_channel*psize shape[0]: out_length * num_of_series + * \param series source series; shape[2]*shape[1]: in_channels, shape[0]: in_length, can be 3D or 4D tensor(multiple series), either dimension 1 or 2 can be used as channel + * \param psize size of each window + * \param pstride stride of each window + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template + inline UnpackWindowToColExp::kDim > unpack_window2col( const Exp &series, index_t psize, index_t pstride ){ + TypeCheckPass< ExpInfo::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return UnpackWindowToColExp::kDim >( series.self(), psize, pstride ); + } + + /*! + * \brief reverse operation of pack_col2window, can be used to implement deconvolution + * \return packed img expression + * \param mat source matrix + * \param tshape shape of target series + * \param psize size of each window + * \param pstride stride of each window + * \tparam Device the Device where input data lies + */ + template + inline PackColToWindowExp pack_col2window( const Tensor &mat, Shape tshape, index_t psize, index_t pstride ){ + utils::Assert( tshape[0] >= psize, "PackColToWindow:series length smaller than patch size"); + return PackColToWindowExp( mat, tshape, psize, pstride ); + } + /*! * \brief a expression that reshapes a tensor to another shape * \param src Tensor: @@ -495,9 +594,25 @@ namespace mshadow{ template inline PaddingExp::kDim> pad(const Exp &src, index_t pad) { TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return PaddingExp::kDim>(src.self(), pad); + return PaddingExp::kDim>(src.self(), pad, pad); + } + + /*! + * \brief padding expression, pad a image with zeros on boundaries, padding affects shape[0], and shape[1] + * \param src original image batches + * \param pad_y padding size in y + * \param pad_x padding size in x + * \return expression corresponding to padded result + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template + inline PaddingExp::kDim> pad(const Exp &src, index_t pad_y, index_t pad_x) { + TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return PaddingExp::kDim>(src.self(), pad_y, pad_x); } + /*! 
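Both pad overloads shift the source by (pad_y, pad_x) and read zero outside of it, so the shape grows by twice the margin in each dimension. A plain-loop equivalent for one H x W channel, mirroring what the lazy Plan evaluation computes element by element (a sketch, not the mshadow implementation):

    // zero-pad one row-major H x W channel to (H + 2*pad_y) x (W + 2*pad_x)
    void pad_channel(const float *src, int H, int W,
                     float *dst, int pad_y, int pad_x) {
      const int oH = H + 2 * pad_y, oW = W + 2 * pad_x;
      for (int y = 0; y < oH; ++y) {
        for (int x = 0; x < oW; ++x) {
          const int h = y - pad_y, w = x - pad_x;
          dst[y * oW + x] =
              (h >= 0 && h < H && w >= 0 && w < W) ? src[h * W + w] : 0.0f;
        }
      }
    }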
* \brief revserse operationg of padding, cut off boundaries, crop output from center of input * \param src original image batches @@ -701,6 +816,60 @@ namespace mshadow{ }; }; + namespace expr{ + template + struct Plan< UnpackWindowToColExp >{ + public: + Plan( const UnpackWindowToColExp &e ) + :src_(MakePlan(e.series_)),psize_(e.psize_), pstride_(e.pstride_), + i_channel_(e.i_channel_), i_length_(e.i_length_), + o_length_(( i_length_ - psize_) / pstride_+1 ){ + } + MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ + const index_t x_offset = i % psize_; + const index_t c = i / psize_; + const index_t x = ( j % o_length_ ) * pstride_ + x_offset; + const index_t n = j / o_length_; + + if( x < o_length_ ){ + return src_.Eval( n * i_channel_ + c , x ); + }else{ + return 0.0f; + } + } + private: + Plan src_; + const index_t psize_, pstride_, i_channel_, i_length_, o_length_; + }; + + template + struct Plan< PackColToWindowExp >{ + public: + Plan( const PackColToWindowExp &e ) + :mat_(e.mat_), psize_(e.psize_), pstride_(e.pstride_), + i_channel_(e.shape_[2]), + o_length_(( e.shape_[0] - psize_ ) / pstride_ + 1){ + } + MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ + using namespace std; + const index_t c = i % i_channel_; + const index_t n = i / i_channel_; + const index_t x = j; + const index_t px_min = x < psize_ ? 0 : (x-psize_+pstride_)/pstride_; + const index_t px_max = min( (x+pstride_)/pstride_, o_length_ ); + + real_t res = 0.0f; + for( index_t px = px_min; px < px_max; ++px ){ + res += mat_[ c * psize_ + x - px*pstride_ ][ n * o_length_ +px ]; + } + return res; + } + private: + Tensor mat_; + const index_t psize_, pstride_, i_channel_, o_length_; + }; + }; + namespace expr{ template struct Plan< ReshapeExp >{ @@ -853,15 +1022,16 @@ namespace mshadow{ struct Plan< PaddingExp > { public: Plan(const PaddingExp &e) - : src_(MakePlan(e.src_)), pad_(e.pad_), new_height_(e.shape_[1]), + : src_(MakePlan(e.src_)), pad_y_(e.pad_y_), pad_x_(e.pad_x_), + new_height_(e.shape_[1]), src_height_(e.src_height_), src_width_(e.src_width_) {} MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { const index_t x = j; const index_t y = i % new_height_; const index_t c = i / new_height_; - if (y < pad_ || x < pad_) return 0.0f; - const index_t h = y - pad_; - const index_t w = x - pad_; + if (y < pad_y_ || x < pad_x_) return 0.0f; + const index_t h = y - pad_y_; + const index_t w = x - pad_x_; if (h < src_height_ && w < src_width_) { return src_.Eval(c * src_height_ + h, w); } else { @@ -870,7 +1040,8 @@ namespace mshadow{ } private: Plan src_; - const index_t pad_; + const index_t pad_y_; + const index_t pad_x_; const index_t new_height_; const index_t src_height_; const index_t src_width_; From 0be3244a6a95d3d412b0f6202481adb46b4fc43c Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 26 Jul 2014 01:24:14 +0000 Subject: [PATCH 002/147] pooling support rectangle shape, old pooling function is deleted --- mshadow/tensor_expr_ext.h | 81 ++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/mshadow/tensor_expr_ext.h b/mshadow/tensor_expr_ext.h index 6184d5be15f4..60e410ed128b 100644 --- a/mshadow/tensor_expr_ext.h +++ b/mshadow/tensor_expr_ext.h @@ -223,8 +223,10 @@ namespace mshadow{ struct PoolingExp: public MakeTensorExp< PoolingExp, SrcExp, srcdim> { /*! \brief source operand */ const SrcExp& src_; - /*! \brief kernel size */ - index_t ksize_; + /*! \brief kernel size in height */ + index_t ksize_y_; + /*! 
\brief kernel size in width */ + index_t ksize_x_; /*! \brief kernel stride */ index_t kstride_; /*! \brief source height shape[1] */ @@ -232,21 +234,21 @@ namespace mshadow{ /*! \brief source width shape[0] */ index_t src_width_; /*! \brief constructor */ - PoolingExp( const SrcExp &src, index_t ksize, index_t kstride ) - : src_(src), ksize_(ksize), kstride_(kstride) { + PoolingExp( const SrcExp &src, index_t ksize_y, index_t ksize_x, index_t kstride ) + : src_(src), ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check( src_ ); - utils::Assert( sshape[0] >= ksize && sshape[1] >= ksize, "pool: kernel must be smaller than image" ); + utils::Assert( sshape[0] >= ksize_x && sshape[1] >= ksize_y, "pool: kernel must be smaller than image" ); this->src_height_ = sshape[1]; this->src_width_ = sshape[0]; this->shape_ = sshape; - this->shape_[1] = (src_height_ - ksize) / kstride + 1; - this->shape_[0] = (src_width_ - ksize) / kstride + 1; + this->shape_[1] = (src_height_ - ksize_y) / kstride + 1; + this->shape_[0] = (src_width_ - ksize_x) / kstride + 1; } /*! \brief constructor, specify shape */ - PoolingExp( const SrcExp &src, Shape<2> pshape, index_t ksize, index_t kstride ) - : src_(src), ksize_(ksize), kstride_(kstride) { + PoolingExp( const SrcExp &src, Shape<2> pshape, index_t ksize_y, index_t ksize_x, index_t kstride ) + : src_(src), ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check( src_ ); - utils::Assert( sshape[0] >= ksize && sshape[1] >= ksize, "pool: kernel must be smaller than image" ); + utils::Assert( sshape[0] >= ksize_x && sshape[1] >= ksize_y, "pool: kernel must be smaller than image" ); this->src_height_ = sshape[1]; this->src_width_ = sshape[0]; this->shape_ = sshape; @@ -268,15 +270,17 @@ namespace mshadow{ const Tensor& data_pooled_; /*! \brief gradient data of pooled part, to be propgate down */ const Tensor& grad_pooled_; - /*! \brief kernel size */ - index_t ksize_; + /*! \brief kernel size in height */ + index_t ksize_y_; + /*! \brief kernel size in width */ + index_t ksize_x_; /*! \brief kernel stride */ index_t kstride_; /*! \brief constructor */ UnPoolingExp( const Tensor &data_src, const Tensor &data_pooled, - const Tensor &grad_pooled, index_t ksize, index_t kstride ) + const Tensor &grad_pooled, index_t ksize_y, index_t ksize_x, index_t kstride ) : data_src_(data_src), data_pooled_(data_pooled), grad_pooled_(grad_pooled), - ksize_(ksize), kstride_(kstride) { + ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { utils::Assert( grad_pooled.shape == data_pooled.shape, "UnPoolingExp: pooled shape mismatch" ); utils::Assert( grad_pooled.shape[2] == data_src.shape[2], "UnPoolingExp: pool and src shape mismatch" ); utils::Assert( grad_pooled.shape[3] == data_src.shape[3], "UnPoolingExp: pool and src shape mismatch" ); @@ -538,23 +542,26 @@ namespace mshadow{ /*! 
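With a rectangular kernel the pooled extent follows the usual sliding-window formula independently per dimension, out = (in - ksize) / kstride + 1, as computed in the constructor above. A quick numeric check (the sizes are made up for illustration):

    // e.g. a 32 x 48 image with ksize_y = 3, ksize_x = 5, kstride = 2:
    //   out_height = (32 - 3) / 2 + 1 = 15
    //   out_width  = (48 - 5) / 2 + 1 = 22
    inline unsigned pooled_extent(unsigned in, unsigned ksize, unsigned kstride) {
      return (in - ksize) / kstride + 1;
    }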
 * \brief pooling subregion results together
 * \param src source image, shape[3]: batch, shape[2]: channel, shape[1]: height, shape[0]: width
- * \param ksize kernel size
+ * \param ksize_y kernel size in height
+ * \param ksize_x kernel size in width
 * \param kstride stride for each kernel
 * \return expression of pooled result
 * \tparam Reducer reducer type
 * \tparam SrcExp source expression
 * \tparam etype type of expression
- */
+ */
 template<typename Reducer, typename SrcExp, int etype>
- inline PoolingExp<Reducer, SrcExp, ExpInfo<SrcExp>::kDim > pool( const Exp<SrcExp, etype> &src, index_t ksize, index_t kstride ) {
+ inline PoolingExp<Reducer, SrcExp, ExpInfo<SrcExp>::kDim > pool( const Exp<SrcExp, etype> &src, index_t ksize_y, index_t ksize_x, index_t kstride ) {
 TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req();
- return PoolingExp<Reducer, SrcExp, ExpInfo<SrcExp>::kDim >(src.self(), ksize, kstride);
+ return PoolingExp<Reducer, SrcExp, ExpInfo<SrcExp>::kDim >(src.self(), ksize_y, ksize_x, kstride);
 }
+
 /*!
 * \brief same as pool, except the output shape is specified by pshape
 * \param src source image
 * \param pshape output shape
- * \param ksize kernel size
+ * \param ksize_y kernel size in y
+ * \param ksize_x kernel size in x
 * \param kstride stride for each kernel
 * \return expression of pooled result
 * \tparam Reducer reducer type
 * \tparam SrcExp source expression
 * \tparam etype type of expression
 */
 template<typename Reducer, typename SrcExp, int etype>
- inline PoolingExp<Reducer, SrcExp, ExpInfo<SrcExp>::kDim > pool( const Exp<SrcExp, etype> &src, Shape<2> pshape, index_t ksize, index_t kstride ) {
+ inline PoolingExp<Reducer, SrcExp, ExpInfo<SrcExp>::kDim > pool( const Exp<SrcExp, etype> &src, Shape<2> pshape, index_t ksize_y, index_t ksize_x, index_t kstride ) {
 TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req();
- return PoolingExp<Reducer, SrcExp, ExpInfo<SrcExp>::kDim >(src.self(), pshape, ksize, kstride);
+ return PoolingExp<Reducer, SrcExp, ExpInfo<SrcExp>::kDim >(src.self(), pshape, ksize_y, ksize_x, kstride);
 }
+
 /*!
 * \brief unpooling gradient for 4D, backprop gradient value back, reverse operation of pooling; same as unpooling, but allows an unequal kernel size
 * \param data_src source input, corresponds to src in pooling
 * \param data_pooled result of pooled data, corresponds to result of pooling
 * \param grad_pooled gradient data of pooled part, to be propagated down
- * \param ksize kernel size
+ * \param ksize_y kernel height
+ * \param ksize_x kernel width
 * \param kstride stride for each kernel
 * \return expression corresponding to unpooled 4D Tensor, storing backpropagated gradient
 * \tparam Reducer reducer type
 */
 template<typename Reducer, typename Device>
 inline UnPoolingExp<Reducer, Device> unpool( const Tensor<Device, 4> &data_src, const Tensor<Device, 4> &data_pooled,
- const Tensor<Device, 4> &grad_pooled, index_t ksize, index_t kstride ) {
- return UnPoolingExp<Reducer, Device>(data_src, data_pooled, grad_pooled, ksize, kstride);
+ const Tensor<Device, 4> &grad_pooled, index_t ksize_y, index_t ksize_x, index_t kstride ) {
+ return UnPoolingExp<Reducer, Device>(data_src, data_pooled, grad_pooled, ksize_y, ksize_x, kstride);
 }
 /*! 
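Put together, a forward/backward max-pooling pair with the rectangular interface might look like this sketch (the tensors are assumed to be pre-allocated with mutually consistent 4D shapes):

    using namespace mshadow;
    using namespace mshadow::expr;

    void maxpool_fwd_bwd(Tensor<cpu, 4> data, Tensor<cpu, 4> grad_in,
                         Tensor<cpu, 4> pooled, Tensor<cpu, 4> grad_pooled) {
      // forward: 2 x 3 window (ksize_y = 2, ksize_x = 3), stride 2
      pooled = pool<red::maximum>(data, 2, 3, 2);
      // backward: route each pooled gradient back to the winning position
      grad_in = unpool<red::maximum>(data, pooled, grad_pooled, 2, 3, 2);
    }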
@@ -956,19 +965,20 @@ namespace mshadow{ struct Plan< PoolingExp< Reducer, SrcExp, srcdim> > { public: Plan( const PoolingExp &e ) - : src_( MakePlan( e.src_ ) ), ksize_(e.ksize_), kstride_(e.kstride_), + : src_( MakePlan( e.src_ ) ), ksize_y_(e.ksize_y_), ksize_x_(e.ksize_x_), + kstride_(e.kstride_), src_height_(e.src_height_),src_width_(e.src_width_), new_height_(e.shape_[1]) { } MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { using namespace std; const index_t py = i % new_height_; const index_t y_start = py * kstride_; - const index_t y_end = min( y_start + ksize_, src_height_ ); + const index_t y_end = min( y_start + ksize_y_, src_height_ ); const index_t px = j; const index_t x_start = px * kstride_; - const index_t x_end = min( x_start + ksize_, src_width_ ); + const index_t x_end = min( x_start + ksize_x_, src_width_ ); const index_t c = i / new_height_; - + real_t res = Reducer::kInitV; for (index_t y = y_start; y < y_end; ++y) { for (index_t x = x_start; x < x_end; ++x) { @@ -979,7 +989,7 @@ namespace mshadow{ } private: Plan src_; - const index_t ksize_, kstride_; + const index_t ksize_y_, ksize_x_, kstride_; const index_t src_height_, src_width_; const index_t new_height_; }; @@ -989,7 +999,7 @@ namespace mshadow{ public: Plan(const UnPoolingExp &e) : data_src_(e.data_src_), data_pooled_(e.data_pooled_), grad_pooled_(e.grad_pooled_), - ksize_(e.ksize_), kstride_(e.kstride_) {} + ksize_y_(e.ksize_y_), ksize_x_(e.ksize_x_), kstride_(e.kstride_) {} MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { using namespace std; const index_t x = j; @@ -997,22 +1007,23 @@ namespace mshadow{ const index_t c = i / data_src_.shape[1]; const real_t vsrc = data_src_[0][c][y][x]; - const index_t py_min = y < ksize_ ? 0 : (y-ksize_+kstride_)/kstride_; - const index_t px_min = x < ksize_ ? 0 : (x-ksize_+kstride_)/kstride_; + const index_t py_min = y < ksize_y_ ? 0 : (y-ksize_y_+kstride_)/kstride_; + const index_t px_min = x < ksize_x_ ? 0 : (x-ksize_x_+kstride_)/kstride_; const index_t py_max = min( (y+kstride_)/kstride_, data_pooled_.shape[1]); const index_t px_max = min( (x+kstride_)/kstride_, data_pooled_.shape[0]); real_t val = 0; for( index_t py = py_min; py < py_max; ++py ){ for( index_t px = px_min; px < px_max; ++px ){ - val += Reducer::PartialGrad(vsrc, data_pooled_[0][c][py][px]) * grad_pooled_[0][c][py][px]; + val += Reducer::PartialGrad(vsrc, data_pooled_[0][c][py][px]) * + grad_pooled_[0][c][py][px]; } } return val; } private: Tensor data_src_, data_pooled_, grad_pooled_; - const index_t ksize_; + const index_t ksize_y_, ksize_x_; const index_t kstride_; }; }; // namespace expr From ea5e09303d9d8131fd9439ec605c3cf6df92a72e Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 26 Jul 2014 01:54:40 +0000 Subject: [PATCH 003/147] pack/unpack now support rectangle shape, remove old version --- mshadow/tensor_expr_ext.h | 91 +++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 43 deletions(-) diff --git a/mshadow/tensor_expr_ext.h b/mshadow/tensor_expr_ext.h index 60e410ed128b..a4af432a09b6 100644 --- a/mshadow/tensor_expr_ext.h +++ b/mshadow/tensor_expr_ext.h @@ -38,8 +38,10 @@ namespace mshadow{ struct UnpackPatchToColXExp: public MakeTensorExp< UnpackPatchToColXExp, SrcExp, 2>{ /*! \brief source operand */ const SrcExp& img_; - /*! \brief patch size */ - index_t psize_; + /*! \brief patch height */ + index_t psize_y_; + /*! \brief patch width */ + index_t psize_x_; /*! \brief patch stride */ index_t pstride_; /*! 
\brief number of input channel */ @@ -49,19 +51,19 @@ namespace mshadow{ /*! \brief width of img */ index_t i_width_; /*! \brief constructor */ - UnpackPatchToColXExp( const SrcExp &img, index_t psize, index_t pstride ) - :img_(img), psize_(psize), pstride_(pstride){ + UnpackPatchToColXExp( const SrcExp &img, index_t psize_y, index_t psize_x, index_t pstride ) + :img_(img), psize_y_(psize_y), psize_x_(psize_x), pstride_(pstride){ Shape imshape = ShapeCheck::Check( img_ ); - utils::Assert( imshape[0] >= psize && imshape[1] >= psize, "UnpackPatchToCol:image shape smaller than patch size"); + utils::Assert( imshape[0] >= psize_x && imshape[1] >= psize_y, "UnpackPatchToCol:image shape smaller than patch size"); this->i_channel_ = imshape[2]; this->i_height_ = imshape[1]; this->i_width_ = imshape[0]; // calculate number of batches const index_t num = imshape.ProdShape( 3, srcdim ); - const index_t o_height = ( i_height_ - psize ) / pstride + 1; - const index_t o_width = ( i_width_ - psize ) / pstride + 1; + const index_t o_height = ( i_height_ - psize_y ) / pstride + 1; + const index_t o_width = ( i_width_ - psize_x ) / pstride + 1; this->shape_[0] = o_height * o_width * num; - this->shape_[1] = psize * psize * imshape[2]; + this->shape_[1] = psize_y * psize_x * imshape[2]; } }; @@ -75,18 +77,20 @@ namespace mshadow{ struct PackColToPatchXExp: public MakeTensorExp< PackColToPatchXExp, Tensor, dstdim>{ /*! \brief source operand */ const Tensor& mat_; - /*! \brief patch size */ - index_t psize_; + /*! \brief patch height */ + index_t psize_y_; + /*! \brief patch height */ + index_t psize_x_; /*! \brief patch stride */ index_t pstride_; /*! \brief constructor */ - PackColToPatchXExp( const Tensor &mat, Shape imshape, index_t psize, index_t pstride ) - :mat_(mat), psize_(psize), pstride_(pstride){ + PackColToPatchXExp( const Tensor &mat, Shape imshape, index_t psize_y, index_t psize_x, index_t pstride ) + :mat_(mat), psize_y_(psize_y), psize_x_(psize_x), pstride_(pstride){ this->shape_ = imshape; - const index_t o_height = ( imshape[1] - psize ) / pstride + 1; - const index_t o_width = ( imshape[0] - psize ) / pstride + 1; + const index_t o_height = ( imshape[1] - psize_y ) / pstride + 1; + const index_t o_width = ( imshape[0] - psize_x ) / pstride + 1; utils::Assert( mat.shape[0] == o_height * o_width * imshape.ProdShape(3,dstdim), "PackColToPatchExp: mat.shape[0] mismatch" ); - utils::Assert( mat.shape[1] == psize * psize * imshape[2], "PackColToPatchExp: mat.shape[1] mismatch" ); + utils::Assert( mat.shape[1] == psize_y * psize_x * imshape[2], "PackColToPatchExp: mat.shape[1] mismatch" ); } }; @@ -426,22 +430,23 @@ namespace mshadow{ * \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution * after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations: * - * weight; shape[1]: out_channel, shape[0]: ichannel*psize*psize + * weight; shape[1]: out_channel, shape[0]: ichannel*psize_y*psize_x * output; shape[1]: out_channel, shape[0]: out_height*out_width * num_of_images - * out_height = ( in_height - psize ) / pstride + 1, this means we pad inperfect patch with 0 - * out_width = ( in_width - psize ) / pstride + 1 + * out_height = ( in_height - psize_y ) / pstride + 1, this means we pad inperfect patch with 0 + * out_width = ( in_width - psize_x ) / pstride + 1 * - * \return mat target matrix; shape[1]: in_channel*psize*psize shape[0]: out_height*out_width * num_of_images + * \return mat target matrix; 
shape[1]: in_channel*psize_y*psize_x shape[0]: out_height*out_width * num_of_images * \param img source image; shape[2]: in_channels, shape[1]: in_height, shape[0]: in_width, can be 3D or 4D tensor(multiple images) - * \param psize height and width of each patch + * \param psize_y height of each patch + * \param psize_x width of each patch * \param pstride stride of each patch * \tparam SrcExp source expression * \tparam etype type of expression */ template - inline UnpackPatchToColXExp::kDim > unpack_patch2col( const Exp &img, index_t psize, index_t pstride ){ + inline UnpackPatchToColXExp::kDim > unpack_patch2col( const Exp &img, index_t psize_y, index_t psize_x, index_t pstride ){ TypeCheckPass< ExpInfo::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return UnpackPatchToColXExp::kDim >( img.self(), psize, pstride ); + return UnpackPatchToColXExp::kDim >( img.self(), psize_y, psize_x, pstride ); } /*! @@ -449,14 +454,15 @@ namespace mshadow{ * \return packed img expression * \param mat source matrix * \param imshape shape of target img - * \param psize height and width of each patch + * \param psize_y height of each patch + * \param psize_x height of each patch * \param pstride stride of each patch * \tparam Device the Device where input data lies */ template - inline PackColToPatchXExp pack_col2patch( const Tensor &mat, Shape imshape, index_t psize, index_t pstride ){ - utils::Assert( imshape[0] >= psize && imshape[1] >= psize, "PackColToPatch:image shape smaller than patch size"); - return PackColToPatchXExp( mat, imshape, psize, pstride ); + inline PackColToPatchXExp pack_col2patch( const Tensor &mat, Shape imshape, index_t psize_y, index_t psize_x, index_t pstride ){ + utils::Assert( imshape[0] >= psize_x && imshape[1] >= psize_y, "PackColToPatch:image shape smaller than patch size"); + return PackColToPatchXExp( mat, imshape, psize_y, psize_x, pstride ); } @@ -764,16 +770,16 @@ namespace mshadow{ struct Plan< UnpackPatchToColXExp >{ public: Plan( const UnpackPatchToColXExp &e ) - :src_(MakePlan(e.img_)),psize_(e.psize_), pstride_(e.pstride_), + :src_(MakePlan(e.img_)), psize_y_(e.psize_y_), psize_x_(e.psize_x_), pstride_(e.pstride_), i_channel_(e.i_channel_), i_height_(e.i_height_), i_width_(e.i_width_), - o_height_(( i_height_ - psize_ ) / pstride_ + 1), - o_width_ (( i_width_ - psize_ ) / pstride_ + 1){ + o_height_(( i_height_ - psize_y_ ) / pstride_ + 1), + o_width_ (( i_width_ - psize_x_ ) / pstride_ + 1){ } MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ - const index_t x_offset = i % psize_; - const index_t idivp = i / psize_; - const index_t y_offset = idivp % psize_; - const index_t c = idivp / psize_; + const index_t x_offset = i % psize_x_; + const index_t idivp = i / psize_x_; + const index_t y_offset = idivp % psize_y_; + const index_t c = idivp / psize_y_; const index_t x = (j % o_width_) * pstride_ + x_offset; const index_t jdivw = j / o_width_; const index_t y = (jdivw % o_height_) * pstride_ + y_offset; @@ -787,17 +793,17 @@ namespace mshadow{ } private: Plan src_; - const index_t psize_, pstride_, i_channel_, i_height_, i_width_, o_height_, o_width_; + const index_t psize_y_, psize_x_, pstride_, i_channel_, i_height_, i_width_, o_height_, o_width_; }; template struct Plan< PackColToPatchXExp >{ public: Plan( const PackColToPatchXExp &e ) - :mat_(e.mat_), psize_(e.psize_), pstride_(e.pstride_), + :mat_(e.mat_), psize_y_(e.psize_y_), psize_x_(e.psize_x_), pstride_(e.pstride_), i_channel_(e.shape_[2]), i_height_(e.shape_[1]), - o_width_(( 
e.shape_[0] - psize_ ) / pstride_ + 1), - o_height_(( e.shape_[1] - psize_ ) / pstride_ + 1){ + o_height_(( e.shape_[1] - psize_y_ ) / pstride_ + 1), + o_width_(( e.shape_[0] - psize_x_ ) / pstride_ + 1){ // note: i/o convention are same as unpack } MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ @@ -807,21 +813,21 @@ namespace mshadow{ const index_t c = idivh % i_channel_; const index_t n = idivh / i_channel_; const index_t x = j; - const index_t py_min = y < psize_ ? 0 : (y-psize_+pstride_)/pstride_; - const index_t px_min = x < psize_ ? 0 : (x-psize_+pstride_)/pstride_; + const index_t py_min = y < psize_y_ ? 0 : (y-psize_y_+pstride_)/pstride_; + const index_t px_min = x < psize_x_ ? 0 : (x-psize_x_+pstride_)/pstride_; const index_t py_max = min( (y+pstride_)/pstride_, o_height_); const index_t px_max = min( (x+pstride_)/pstride_, o_width_ ); real_t res = 0.0f; for( index_t py = py_min; py < py_max; ++py ){ for( index_t px = px_min; px < px_max; ++px ){ - res += mat_[ (c * psize_ + y - py*pstride_) * psize_ + x - px*pstride_ ][ (n * o_height_ + py) * o_width_+px ]; + res += mat_[ (c * psize_y_ + y - py*pstride_) * psize_x_ + x - px*pstride_ ][ (n * o_height_ + py) * o_width_+px ]; } } return res; } private: Tensor mat_; - const index_t psize_, pstride_, i_channel_, i_height_, o_width_, o_height_; + const index_t psize_y_, psize_x_, pstride_, i_channel_, i_height_, o_height_, o_width_; }; }; @@ -1015,8 +1021,7 @@ namespace mshadow{ real_t val = 0; for( index_t py = py_min; py < py_max; ++py ){ for( index_t px = px_min; px < px_max; ++px ){ - val += Reducer::PartialGrad(vsrc, data_pooled_[0][c][py][px]) * - grad_pooled_[0][c][py][px]; + val += Reducer::PartialGrad(vsrc, data_pooled_[0][c][py][px]) * grad_pooled_[0][c][py][px]; } } return val; From 99c8df1dd7e8201b3cd2731e390820eec17f47ad Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 26 Jul 2014 04:29:09 +0000 Subject: [PATCH 004/147] more consistent boundary check --- mshadow/tensor_expr_ext.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mshadow/tensor_expr_ext.h b/mshadow/tensor_expr_ext.h index a4af432a09b6..73ab282fdc9e 100644 --- a/mshadow/tensor_expr_ext.h +++ b/mshadow/tensor_expr_ext.h @@ -350,8 +350,8 @@ namespace mshadow{ CroppingExp(const SrcExp &src, Shape<2> cshape, index_t start_height, index_t start_width ) : src_(src), pad_height_(start_height), pad_width_(start_width) { this->shape_ = ShapeCheck::Check( src_ ); - utils::Assert(this->shape_[1] >= cshape[1], "CroppingExp: height requirement not met"); - utils::Assert(this->shape_[0] >= cshape[0], "CroppingExp: width requirement not met"); + utils::Assert(this->shape_[1] >= cshape[1]+start_height, "CroppingExp: height requirement not met"); + utils::Assert(this->shape_[0] >= cshape[0]+start_width, "CroppingExp: width requirement not met"); src_height_ = this->shape_[1]; this->shape_[1] = cshape[1]; // width this->shape_[0] = cshape[0]; // height From 9b1e536a3382811c34487d9f8bf8565ae2fe0310 Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 29 Jul 2014 00:02:12 +0000 Subject: [PATCH 005/147] sparse input --- mshadow/tensor_base.h | 2 +- mshadow/tensor_expr_ext.h | 150 -------------------------------------- 2 files changed, 1 insertion(+), 151 deletions(-) diff --git a/mshadow/tensor_base.h b/mshadow/tensor_base.h index b251cbadf4fc..fe09960d2445 100644 --- a/mshadow/tensor_base.h +++ b/mshadow/tensor_base.h @@ -56,7 +56,7 @@ #endif /*! 
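The tightened assertions in PATCH 004 matter whenever the crop is off-center: it is not enough for the source to be at least as large as the crop window; the window shifted by its start offset must also stay inside the source. A worked check with made-up numbers:

    // crop requires shape >= cshape + start per dimension; e.g. for a 10 x 10
    // source and a 6 x 6 crop starting at row 5:
    //   old check: 10 >= 6     -> passes, yet rows 5..10 would read out of bounds
    //   new check: 10 >= 6 + 5 -> fails, as it should
    inline bool crop_fits(unsigned src_h, unsigned src_w,
                          unsigned crop_h, unsigned crop_w,
                          unsigned start_h, unsigned start_w) {
      return src_h >= crop_h + start_h && src_w >= crop_w + start_w;
    }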
\brief whether use SSE */ #ifndef MSHADOW_USE_SSE - #define MSHADOW_USE_SSE 1 + #define MSHADOW_USE_SSE 0 #endif /*! \brief whether use NVML to get dynamic info */ #ifndef MSHADOW_USE_NVML diff --git a/mshadow/tensor_expr_ext.h b/mshadow/tensor_expr_ext.h index 73ab282fdc9e..39742b8aecda 100644 --- a/mshadow/tensor_expr_ext.h +++ b/mshadow/tensor_expr_ext.h @@ -94,65 +94,6 @@ namespace mshadow{ } }; - /*! - * \brief unpack local (overlap) windows of a series to column of mat, can be used to implement 1D convolution, - * this expression allow unpack of a batch this is a version support unpacking multiple series - * after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations: - * \tparam SrcExp source expression - * \tparam dstdim destination dimension - */ - template - struct UnpackWindowToColExp: public MakeTensorExp< UnpackWindowToColExp, SrcExp, 2>{ - /*! \brief source operand */ - const SrcExp& series_; - /*! \brief patch size */ - index_t psize_; - /*! \brief patch stride */ - index_t pstride_; - /*! \brief number of input channel */ - index_t i_channel_; - /*! \brief length of img */ - index_t i_length_; - - /*! \brief constructor */ - UnpackWindowToColExp( const SrcExp series, index_t psize, index_t pstride ) - :series_(series), psize_(psize), pstride_(pstride){ - Shape ishape = ShapeCheck::Check( series_ ); - utils::Assert( ishape[0] >= psize , "UnpackWindowToCol:series length smaller than patch size"); - this->i_channel_ = ishape[2] * ishape[1]; - this->i_length_ = ishape[0]; - // calculate number of batches - const index_t num = ishape.ProdShape( 3, srcdim ); - const index_t o_length = ( i_length_ - psize ) / pstride + 1; - this->shape_[0] = o_length * num; - this->shape_[1] = psize * i_channel_; - } - }; - - /*! - * \brief reverse operation of UnpackWindowToCol, used to backprop gradient back - * this is a version supporting multiple images - * \tparam Device which device it lies - * \tparam dstdim destination dimension - */ - template - struct PackColToWindowExp: public MakeTensorExp< PackColToWindowExp, Tensor, dstdim>{ - /*! \brief source operand */ - const Tensor& mat_; - /*! \brief patch size */ - index_t psize_; - /*! \brief patch stride */ - index_t pstride_; - /*! \brief constructor */ - PackColToWindowExp( const Tensor &mat, Shape ishape, index_t psize, index_t pstride ) - :mat_(mat), psize_(psize), pstride_(pstride){ - this->shape_ = ishape; - const index_t o_length = ( ishape[0] - psize ) / pstride + 1; - utils::Assert( mat.shape[0] == o_length * ishape.ProdShape(3,dstdim), "PackColToWindowExp: mat.shape[0] mismatch" ); - utils::Assert( mat.shape[1] == psize * ishape[1] * ishape[2], "PackColToWindowExp: mat.shape[1] mismatch" ); - } - }; - /*! * \brief reshape the content to another shape * input: Tensor: ishape @@ -465,43 +406,6 @@ namespace mshadow{ return PackColToPatchXExp( mat, imshape, psize_y, psize_x, pstride ); } - - /*! 
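The dedicated 1D window expressions are deleted below because the rectangular patch operators from PATCH 003 subsume them: a series is an image of height one, so a 1 x psize patch reproduces unpack_window2col exactly. A sketch of the replacement path (function and tensor names are illustrative; shapes are assumed compatible):

    using namespace mshadow;
    using namespace mshadow::expr;

    // series_img: in_channel x 1 x length; weight: out_channel x (in_channel * psize)
    void conv1d_as_gemm(Tensor<cpu, 2> out, Tensor<cpu, 2> tmp,
                        Tensor<cpu, 2> weight, Tensor<cpu, 3> series_img,
                        index_t psize, index_t pstride) {
      tmp = unpack_patch2col(series_img, 1, psize, pstride);  // 1 x psize patches
      out = dot(weight, tmp);                                 // convolution as GEMM
    }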
- * \brief unpack local (overlap) windows of time series to column of mat, can be used to implement 1D convolution - * after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations: - * - * weight; shape[1]: out_channel, shape[0]: ichannel*psize - * output; shape[1]: out_channel, shape[0]: out_length * num_of_series - * out_length = ( in_length - psize ) / pstride + 1 - * - * \return mat target matrix; shape[1]: in_channel*psize shape[0]: out_length * num_of_series - * \param series source series; shape[2]*shape[1]: in_channels, shape[0]: in_length, can be 3D or 4D tensor(multiple series), either dimension 1 or 2 can be used as channel - * \param psize size of each window - * \param pstride stride of each window - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline UnpackWindowToColExp::kDim > unpack_window2col( const Exp &series, index_t psize, index_t pstride ){ - TypeCheckPass< ExpInfo::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return UnpackWindowToColExp::kDim >( series.self(), psize, pstride ); - } - - /*! - * \brief reverse operation of pack_col2window, can be used to implement deconvolution - * \return packed img expression - * \param mat source matrix - * \param tshape shape of target series - * \param psize size of each window - * \param pstride stride of each window - * \tparam Device the Device where input data lies - */ - template - inline PackColToWindowExp pack_col2window( const Tensor &mat, Shape tshape, index_t psize, index_t pstride ){ - utils::Assert( tshape[0] >= psize, "PackColToWindow:series length smaller than patch size"); - return PackColToWindowExp( mat, tshape, psize, pstride ); - } - /*! * \brief a expression that reshapes a tensor to another shape * \param src Tensor: @@ -831,60 +735,6 @@ namespace mshadow{ }; }; - namespace expr{ - template - struct Plan< UnpackWindowToColExp >{ - public: - Plan( const UnpackWindowToColExp &e ) - :src_(MakePlan(e.series_)),psize_(e.psize_), pstride_(e.pstride_), - i_channel_(e.i_channel_), i_length_(e.i_length_), - o_length_(( i_length_ - psize_) / pstride_+1 ){ - } - MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ - const index_t x_offset = i % psize_; - const index_t c = i / psize_; - const index_t x = ( j % o_length_ ) * pstride_ + x_offset; - const index_t n = j / o_length_; - - if( x < o_length_ ){ - return src_.Eval( n * i_channel_ + c , x ); - }else{ - return 0.0f; - } - } - private: - Plan src_; - const index_t psize_, pstride_, i_channel_, i_length_, o_length_; - }; - - template - struct Plan< PackColToWindowExp >{ - public: - Plan( const PackColToWindowExp &e ) - :mat_(e.mat_), psize_(e.psize_), pstride_(e.pstride_), - i_channel_(e.shape_[2]), - o_length_(( e.shape_[0] - psize_ ) / pstride_ + 1){ - } - MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ - using namespace std; - const index_t c = i % i_channel_; - const index_t n = i / i_channel_; - const index_t x = j; - const index_t px_min = x < psize_ ? 
0 : (x-psize_+pstride_)/pstride_; - const index_t px_max = min( (x+pstride_)/pstride_, o_length_ ); - - real_t res = 0.0f; - for( index_t px = px_min; px < px_max; ++px ){ - res += mat_[ c * psize_ + x - px*pstride_ ][ n * o_length_ +px ]; - } - return res; - } - private: - Tensor mat_; - const index_t psize_, pstride_, i_channel_, o_length_; - }; - }; - namespace expr{ template struct Plan< ReshapeExp >{ From 9f0fae83789d2aa615b0e091cbb063609c634da1 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 20 Aug 2014 09:00:05 -0700 Subject: [PATCH 006/147] add v dot --- mshadow/tensor_cpu-inl.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mshadow/tensor_cpu-inl.hpp b/mshadow/tensor_cpu-inl.hpp index 0fa3cfa50306..4015afae8709 100644 --- a/mshadow/tensor_cpu-inl.hpp +++ b/mshadow/tensor_cpu-inl.hpp @@ -163,6 +163,15 @@ namespace mshadow { Softmax( dst[y], energy[y] ); } } + + inline real_t VDot( const Tensor& lhs, const Tensor& rhs ){ + utils::Assert( lhs.shape == rhs.shape, "VDot: shape mismatch" ); + real_t sum = 0.0f; + for( index_t x = 0; x < lhs.shape[0]; ++x ){ + sum += lhs[x] * rhs[x]; + } + return sum; + } }; // namespace mshadow #endif // TENSOR_CPU_INL_HPP From 87334a565df40306318d83fc00bf11b1300fc908 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 19 Oct 2014 16:46:03 -0700 Subject: [PATCH 007/147] quick potential fix --- mshadow/cuda/cuda_reduce.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mshadow/cuda/cuda_reduce.cuh b/mshadow/cuda/cuda_reduce.cuh index b7808a6ffa30..5f4e22a8124f 100644 --- a/mshadow/cuda/cuda_reduce.cuh +++ b/mshadow/cuda/cuda_reduce.cuh @@ -62,7 +62,8 @@ namespace mshadow{ // in warp optimization if( x_bits >= 5 ){ if( tid < 16 ) Reducer::Reduce( buf[tid] , buf[tid + 16] ); - __MSHADOW_EMUSYNC__; + // for save, change it to 5 warp + __syncthreads(); } if( x_bits >= 4 ){ if( tid < 8 ) Reducer::Reduce( buf[tid] , buf[tid + 8 ] ); From 471715efc7d6dc99b5160677f4de669198c04525 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 19 Oct 2014 17:12:15 -0700 Subject: [PATCH 008/147] fix cuda arch missing problem --- mshadow/cuda/cuda_reduce.cuh | 7 +++++-- mshadow/cuda/tensor_gpu-inl.cuh | 5 ++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/mshadow/cuda/cuda_reduce.cuh b/mshadow/cuda/cuda_reduce.cuh index 5f4e22a8124f..393132ab438d 100644 --- a/mshadow/cuda/cuda_reduce.cuh +++ b/mshadow/cuda/cuda_reduce.cuh @@ -62,12 +62,15 @@ namespace mshadow{ // in warp optimization if( x_bits >= 5 ){ if( tid < 16 ) Reducer::Reduce( buf[tid] , buf[tid + 16] ); - // for save, change it to 5 warp + #if __CUDA_ARCH__ < 200 __syncthreads(); + #else + __MSHADOW_EMUSYNC__; + #endif } if( x_bits >= 4 ){ if( tid < 8 ) Reducer::Reduce( buf[tid] , buf[tid + 8 ] ); - __MSHADOW_EMUSYNC__; + __MSHADOW_EMUSYNC__; } if( x_bits >= 3 ){ if( tid < 4 ) Reducer::Reduce( buf[tid] , buf[tid + 4 ] ); diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index 3739db2cc4d0..61e477cf531b 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -10,8 +10,11 @@ namespace mshadow{ namespace cuda{ + #ifndef __CUDA_ARCH__ + #warning "__CUDA_ARCH__ is not defined, I will assume compiling with CUDA verion greater than 2.0" + #endif /* load unit for memory access */ - #if __CUDA_ARCH__>=200 + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200 const int kMemUnitBits = 5; const int kMaxThreadsPerBlock = 1024; #else From 7da8ba3f7a1ff96281ea52d7bce06755ac2a3f01 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: 
Fri, 5 Dec 2014 14:40:09 -0800
Subject: [PATCH 009/147] Update tensor.h

add size
---
 mshadow/tensor.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/mshadow/tensor.h b/mshadow/tensor.h
index 033540a9116c..ba666a54acbf 100644
--- a/mshadow/tensor.h
+++ b/mshadow/tensor.h
@@ -222,6 +222,16 @@ namespace mshadow {
 MSHADOW_XINLINE Tensor(const Shape<dimension> &shape): shape(shape) {}
 /*! \brief constructor from data pointer and shape */
 MSHADOW_XINLINE Tensor(real_t *dptr, const Shape<dimension> &shape): dptr((real_t*)dptr), shape(shape) {}
+ /*!
+ * \brief return the size of the i-th dimension, counting from the highest dimension
+ * This matches the common convention for matrix sizes. 
Note that mat.shape[0] gives the lowest dimension,
+ * while mat.size(0) returns the highest dimension
+ * \param i the dimension, counted from the highest dimension
+ * \return the size
+ */
+ MSHADOW_XINLINE index_t size(index_t i) const {
+ return shape_[dimension - 1 - i];
+ }
 /*!
 * \brief flatten the tensor to 2 dimension, collapse the higher dimensions together
 * \return tensor after flatten
 */

From f2eb4c27b6a032c1e35eb0098d48f3cdc8a0952d Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Fri, 5 Dec 2014 14:41:16 -0800
Subject: [PATCH 010/147] Update tensor.h

---
 mshadow/tensor.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mshadow/tensor.h b/mshadow/tensor.h
index ba666a54acbf..510fb3b3d191 100644
--- a/mshadow/tensor.h
+++ b/mshadow/tensor.h
@@ -302,6 +302,9 @@ namespace mshadow {
 return Tensor<Device, 1>(reinterpret_cast<real_t*> \
 (dptr) + begin, s);
 }
+ MSHADOW_XINLINE index_t size(index_t i) const {
+ return shape_[0];
+ }
 MSHADOW_XINLINE real_t &operator[](index_t idx) { return dptr[ idx ]; }
 MSHADOW_XINLINE const real_t &operator[](index_t idx)const { return dptr[ idx ]; }
 public:

From acac7311bfba5c53a4deb8717d1f4b9a1e10fc04 Mon Sep 17 00:00:00 2001
From: winsty
Date: Wed, 10 Dec 2014 04:02:34 +0800
Subject: [PATCH 011/147] support transpose in chain operation. also update
 examples.

---
 example/basic.cpp                  | 15 ++++++++++++
 mshadow/tensor.h                   | 13 +++++++++++
 mshadow/tensor_expr_engine-inl.hpp | 37 ++++++++++++++++++++++++++++++
 3 files changed, 65 insertions(+)

diff --git a/example/basic.cpp b/example/basic.cpp
index 53f85ae1a262..be26ccd3bb51 100644
--- a/example/basic.cpp
+++ b/example/basic.cpp
@@ -35,6 +35,21 @@ int main( void ){
 }
 printf("\n");
 }
+
+ // create a tensor without explicitly allocating spaces.
+ Tensor<cpu, 2> mat3 = NewTensor<cpu>(Shape2(2, 5), 0.0f);
+ // transpose, and then add 1.
+ mat3 = mat.T() + 1;
+
+ // index the shape using size(), this is more natural for MATLAB/numpy users.
+ 
printf("%u X %u matrix\n", mat3.size(0), mat3.size(1) ); diff --git a/mshadow/tensor_expr.h b/mshadow/tensor_expr.h index ac8fde79f1c6..39fae450a463 100644 --- a/mshadow/tensor_expr.h +++ b/mshadow/tensor_expr.h @@ -68,7 +68,7 @@ namespace mshadow{ /*! \brief represent a transpose expression of a container */ template - struct TransposeExp: public Exp< TransposeExp, type::kComplex >{ + struct TransposeExp: public Exp< TransposeExp, type::kMapper >{ public: /*! \brief expression to be transposed */ const EType &exp; diff --git a/mshadow/tensor_expr_engine-inl.hpp b/mshadow/tensor_expr_engine-inl.hpp index faf305bec57d..fe72b3e366ed 100644 --- a/mshadow/tensor_expr_engine-inl.hpp +++ b/mshadow/tensor_expr_engine-inl.hpp @@ -441,13 +441,6 @@ namespace mshadow{ DotEngine::Eval( dst, exp.lhs_, exp.rhs_, exp.scale_ ); } }; - - template - struct ExpComplexEngine< SV, Device, dim, E >{ - inline static void Eval( Tensor &dst, const E &exp ){ - MapExp(dst, exp ); - } - }; }; // namespace expr }; #endif From de53cb70f9cf21613a01df98ee227b767546e772 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 24 Dec 2014 04:36:21 -0800 Subject: [PATCH 013/147] change expression, let us make cpu version work today --- mshadow/base.h | 303 ++++++++++++++ mshadow/expression.h | 417 +++++++++++++++++++ mshadow/tensor.h | 945 +++++++++++++++++++++--------------------- mshadow/tensor_base.h | 298 ------------- mshadow/tensor_expr.h | 367 ---------------- mshadow/utils.h | 79 ++++ 6 files changed, 1272 insertions(+), 1137 deletions(-) create mode 100644 mshadow/base.h create mode 100644 mshadow/expression.h delete mode 100644 mshadow/tensor_base.h delete mode 100644 mshadow/tensor_expr.h create mode 100644 mshadow/utils.h diff --git a/mshadow/base.h b/mshadow/base.h new file mode 100644 index 000000000000..94aadb95c7b2 --- /dev/null +++ b/mshadow/base.h @@ -0,0 +1,303 @@ +#ifndef MSHADOW_BASE_H_ +#define MSHADOW_BASE_H_ +/*! + * \file base.h + * \brief definitions of base types, operators, macros functions + * + * \author Bing Xu, Tianqi Chen + */ +#include +#include +#include +#include +#include +// macro defintiions +/*!\brief if this macro is define to be 1, mshadow should compile without any of other libs */ +#ifndef MSHADOW_STAND_ALONE +#define MSHADOW_STAND_ALONE 1 +#endif +/*! \brief whether do padding during allocation */ +#ifndef MSHADOW_ALLOC_PAD +#define MSHADOW_ALLOC_PAD true +#endif +/*! + * \brief x dimension of data must be bigger pad_size * ratio to be alloced padded memory, otherwise use tide allocation + * for example, if pad_ratio=2, GPU memory alignement size is 32, then we will only allocate padded memory if x dimension > 64 + * set it to 0 then we will always allocate padded memory + */ +#ifndef MSHADOW_MIN_PAD_RATIO + #define MSHADOW_MIN_PAD_RATIO 2 +#endif + +#if MSHADOW_STAND_ALONE + #define MSHADOW_USE_CBLAS 0 + #define MSHADOW_USE_MKL 0 + #define MSHADOW_USE_CUDA 0 +#endif + +/*! \brief use CBLAS for CBLAS */ +#ifndef MSHADOW_USE_CBLAS + #define MSHADOW_USE_CBLAS 0 +#endif +/*! \brief use MKL for BLAS */ +#ifndef MSHADOW_USE_MKL + #define MSHADOW_USE_MKL 0 +#endif +/*! + * \brief use CUDA support, must ensure that the cuda include path is correct, + * or directly compile using nvcc + */ +#ifndef MSHADOW_USE_CUDA + #define MSHADOW_USE_CUDA 1 +#endif +/*! \brief use single precition float */ +#ifndef MSHADOW_SINGLE_PRECISION + #define MSHADOW_SINGLE_PRECISION 1 +#endif +/*! \brief whether use SSE */ +#ifndef MSHADOW_USE_SSE + #define MSHADOW_USE_SSE 0 +#endif +/*! 
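Making the transpose a kMapper expression (PATCH 012) is what lets it fuse into a single elementwise pass: the transpose plan only swaps the coordinates it requests from its child, so no transposed temporary is ever materialized. Conceptually, mat3 = mat.T() + mat4 evaluates like this scalar sketch (src is the cols x rows untransposed matrix; dst and rhs are rows x cols, all row-major):

    void eval_transpose_plus(const float *src, const float *rhs, float *dst,
                             int rows, int cols) {
      for (int y = 0; y < rows; ++y) {
        for (int x = 0; x < cols; ++x) {
          // Plan<TransposeExp>::Eval(y, x) forwards to child.Eval(x, y)
          dst[y * cols + x] = src[x * rows + y] + rhs[y * cols + x];
        }
      }
    }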
\brief whether use NVML to get dynamic info */ +#ifndef MSHADOW_USE_NVML + #define MSHADOW_USE_NVML 0 +#endif +// SSE is conflict with cudacc +#ifdef __CUDACC__ + #undef MSHADOW_USE_SSE + #define MSHADOW_USE_SSE 0 +#endif + +#if MSHADOW_USE_CBLAS +extern "C" { + #include +} +#elif MSHADOW_USE_MKL + #include + #include + #include + #include +#endif + +#if MSHADOW_USE_CUDA + #include + #include +#endif + +#if MSHADOW_USE_NVML + #include +#endif +// -------------------------------- +// MSHADOW_XINLINE is used for inlining template code for both CUDA and CPU code. +#ifdef MSHADOW_XINLINE + #error "MSHADOW_XINLINE must not be defined" +#endif +#ifdef __CUDACC__ + #define MSHADOW_XINLINE inline __attribute__((always_inline)) __device__ __host__ +#else + #define MSHADOW_XINLINE inline __attribute__((always_inline)) +#endif +/*! \brief cpu force inline */ +#define MSHADOW_CINLINE inline __attribute__((always_inline)) + +#if defined(__GXX_EXPERIMENTAL_CXX0X) ||\ + defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L + #define MSHADOW_CONSTEXPR constexpr +#else + #define MSHADOW_CONSTEXPR const +#endif + +/*! \brief namespace for mshadow */ +namespace mshadow { +/*! \brief buffer size for each random number generator */ +const unsigned kRandBufferSize = 1000000; +/*! \brief pi */ +const float kPi = 3.1415926f; +/*! \brief type that will be used for index */ +typedef unsigned index_t; +/*! \brief float point type that will be used in default by mshadow */ +typedef float default_real_t; + +/*! \brief namespace for operators */ +namespace op { +// binary operator +/*! \brief mul operator */ +struct mul{ + /*! \brief map a, b to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + return a * b; + } +}; +/*! \brief plus operator */ +struct plus { + /*! \brief map a, b to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + return a + b; + } +}; +/*! \brief minus operator */ +struct minus { + /*! \brief map a, b to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + return a - b; + } +}; +/*! \brief divide operator */ +struct div { + /*! \brief map a, b to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + return a / b; + } +}; +/*! \brief get rhs */ +struct right { + /*! \brief map a, b to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + return b; + } +}; +// unary operator/ function: example +// these operators can be defined by user, in the same style as binary and unary operator +// to use, simply write F( src ) +/*! \brief identity function that maps a real number to it self */ +struct identity{ + /*! \brief map a to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a) { + return a; + } +}; +} // namespace op +/*! \brief namespace for savers */ +namespace sv { +/*! \brief save to saver: = */ +struct saveto { + /*! \brief save b to a using save method */ + template + MSHADOW_XINLINE static void Save(DType &a, DType b) { + a = b; + } + /*! \brief helper constant to use BLAS, alpha */ + MSHADOW_CONSTEXPR static default_real_t kAlphaBLAS = 1.0f; + /*! \brief helper constant to use BLAS, beta */ + MSHADOW_CONSTEXPR static default_real_t kBetaBLAS = 0.0f; + /*! \brief corresponding binary operator type */ + typedef op::right OPType; +}; +/*! \brief save to saver: += */ +struct plusto { + /*! 
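The comment on unary operators above is the extension point: user code can declare new elementwise operators outside the library and apply them in expressions through F. A hedged example, with a made-up operator name:

    // a user-defined elementwise op, applied as F<my_square>( src ) in expressions
    struct my_square {
      template<typename DType>
      MSHADOW_XINLINE static DType Map(DType a) {
        return a * a;
      }
    };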
\brief save b to a using save method */ + template + MSHADOW_XINLINE static void Save(DType &a, DType b) { + a += b; + } + /*! \brief helper constant to use BLAS, alpha */ + MSHADOW_CONSTEXPR static default_real_t kAlphaBLAS = 1.0f; + /*! \brief helper constant to use BLAS, beta */ + MSHADOW_CONSTEXPR static default_real_t kBetaBLAS = 1.0f; + /*! \brief corresponding binary operator type */ + typedef op::plus OPType; +}; +/*! \brief minus to saver: -= */ +struct minusto { + /*! \brief save b to a using save method */ + template + MSHADOW_XINLINE static void Save(DType &a, DType b) { + a -= b; + } + /*! \brief helper constant to use BLAS, alpha */ + MSHADOW_CONSTEXPR static default_real_t kAlphaBLAS = -1.0f; + /*! \brief helper constant to use BLAS, beta */ + MSHADOW_CONSTEXPR static default_real_t kBetaBLAS = 1.0f; + /*! \brief corresponding binary operator type */ + typedef op::minus OPType; +}; +/*! \brief multiply to saver: *= */ +struct multo { + /*! \brief save b to a using save method */ + template + MSHADOW_XINLINE static void Save(DType &a, DType b) { + a *= b; + } + /*! \brief corresponding binary operator type */ + typedef op::mul OPType; +}; +/*! \brief divide to saver: /= */ +struct divto { + /*! \brief save b to a using save method */ + template + MSHADOW_XINLINE static void Save(DType& a, DType b) { + a /= b; + } + /*! \brief corresponding binary operator type */ + typedef op::div OPType; +}; +} // namespace sv +/*! \brief namespace for potential reducer operations */ +namespace red { +/*! \brief sum reducer */ +struct sum { + /*! \brief do reduction into dst */ + template + MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src) { + dst += src; + } + /*! + *\brief calculate gradient of redres with respect to redsrc, + * redres: reduced result, redsrc: one of reduction element + */ + template + MSHADOW_XINLINE static DType PartialGrad(DType redres, DType redsrc) { + return 1; + } + template + MSHADOW_XINLINE static DType InitValue(void) { + return 0; + } +}; +/*! \brief helper namespace to get the limits */ +namespace limits { + template + MSHADOW_XINLINE DType MinValue(void); + template<> + MSHADOW_XINLINE float MinValue(void) { + return -FLT_MAX; + } + template<> + MSHADOW_XINLINE double MinValue(void) { + return -DBL_MAX; + } + template<> + MSHADOW_XINLINE int MinValue(void) { + return INT_MIN; + } +} // namespace limits +/*! \brief maximum reducer */ +struct maximum { + /*! \brief do reduction into dst */ + template + MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src) { + using std::max; + dst = max(dst, src); + } + /*! + * \brief calculate gradient of redres with respect to redsrc, + * redres: reduced result, redsrc: one of reduction element + */ + template + MSHADOW_XINLINE static DType PartialGrad(DType redres, DType redsrc) { + return redres == redsrc ? 1: 0; + } + template + MSHADOW_XINLINE static DType InitValue(void) { + return limits::MinValue(); + } +}; +} // namespace red +} // namespace mshadow +#endif // MSHADOW_BASE_H_ diff --git a/mshadow/expression.h b/mshadow/expression.h new file mode 100644 index 000000000000..554997df85b6 --- /dev/null +++ b/mshadow/expression.h @@ -0,0 +1,417 @@ +#ifndef MSHADOW_EXPRESSION_H_ +#define MSHADOW_EXPRESSION_H_ +/*! + * \file expression.h + * \brief definitions of abstract expressions and expressions template + * \author Tianqi Chen, Bing Xu + */ +#include "./base.h" + +namespace mshadow { +/*! 
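Both pooling plans earlier in the series are driven by reducers of exactly this shape: the forward pass folds each window with Reduce starting from InitValue, and the backward pass weights every element by PartialGrad. The fold itself, as a minimal sketch:

    // fold n values with a Reducer such as red::maximum or red::sum
    template<typename Reducer>
    float fold_window(const float *v, int n) {
      float res = Reducer::template InitValue<float>();
      for (int i = 0; i < n; ++i) {
        Reducer::Reduce(res, v[i]);
      }
      return res;
    }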
+ * \brief namespace for abstract expressions and expressions template, + * have no dependecy on tensor.h, + * These data structure takes no charge in computations, + * they are only used to define operations and represent expression in a symbolic way + */ +namespace expr { +/*! \brief type of expressions */ +namespace type { +// type expression type are defined as bitmask +// subtype relationshop kRValue < kMapper < kPull < kComplex +/*! \brief this expression directly correspnds to a data class, can be used to assign data */ +const int kRValue = 0; +/*! \brief expression contains element-wise tensor operations, map a expression to same shape */ +const int kMapper = 1; +/*! + * \brief expression that can be chained with other expressiones + * Usually it have function Eval(i,j) defined, which pulls the result (i, j) from input + * expression and output the result at certain position. + */ +const int kChainer = 3; +/*! \brief othercase: e.g dot product */ +const int kComplex = 7; +} // namespace type +/*! + * \brief expression engine that actually interprets these expressions + * this is a function template that needed to be implemented for specific expressions + * \tparam Saver the save method + * \tparam RValue the type of RValue to be saved + * \sa namespace sv + */ +template +struct ExpEngine { + /*! \brief defines how expression exp can be evaluated and stored into dst */ + template + inline static void Eval(RValue& dst, const EType &exp); +}; +/*! + * \brief base class for expression + * \tparam SubType inheritated class must put their type into this parameter + * \tparam exp_type expression type, see namespace type + */ +template +struct Exp { + public: + /*! \return subtype instance of current class */ + inline const SubType& self(void) const { + return *static_cast(this); + } + /*! \return reference of subtype instance of current class */ + inline SubType& refself(void) { + return *static_cast(this); + } +}; +/*! + * \brief scalar expression + * \tparam DType the data type of the scalar + */ +template +struct ScalarExp: public Exp, type::kMapper> { + /*! \brief scalar value */ + DType scalar_; + /*! \brief constructor, must be implicit for implicit conversion */ + ScalarExp(DType scalar) : scalar_(scalar) {} +}; +/*! \brief create an scalar expression */ +template +inline ScalarExp scalar(DType s) { + return ScalarExp(s); +} +/*! + * \brief typecast expression, cast the type of elements + * \tparam DType the target type we want to cast into + * \tparam EType the type of the source expression + * \tparam etype the type of expression after cast + */ +template +struct TypecastExp: public Exp, etype> { + const EType &exp; + /*! \brief constructor */ + explicit TypecastExp(const EType &e) : exp(e) {} +}; +/*! \brief create an scalar expression */ +template +inline TypecastExp tcast(const Exp &exp) { + return TypecastExp(exp.self()); +} +/*! \brief represent a transpose expression of a container */ +template +struct TransposeExp: public Exp, type::kChainer> { + /*! \brief expression to be transposed */ + const EType &exp; + /*! \brief constructor */ + explicit TransposeExp(const EType &e) : exp(e) {} + /*! \brief transpose expression */ + inline const EType &T(void) const { + return exp; + } +}; +/*! + * \brief base class of all rvalues + * \tparam Container the actually class of data container, e.g. Tensor1D + * \tparam DataType the element data type of each element in the container + */ +template +class RValueExp: public Exp { + public: + /*! 
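+   * (editorial note) Container is the derived class (CRTP): every operator
+   * below routes its right-hand side through ExpEngine<Saver, Container>,
+   * so a concrete tensor only has to forward its operator= to __assign.
+   */
+  /*!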
+ *\brief transpose of a matrix + *\return transpose of current expression + */ + inline const TransposeExp T(void) const { + return TransposeExp(this->self()); + } + /*! \brief operator overload */ + inline Container &operator+=(default_real_t s) { + ExpEngine::Eval(this->refself(), scalar(s)); + return this->refself(); + } + /*! \brief operator overload */ + inline Container &operator-=(default_real_t s) { + ExpEngine::Eval(this->refself(), scalar(s)); + return this->refself(); + } + /*! \brief operator overload */ + inline Container &operator*=(default_real_t s) { + ExpEngine::Eval(this->refself(), scalar(s)); + return this->refself(); + } + /*! \brief operator overload */ + inline Container &operator/=(default_real_t s) { + ExpEngine::Eval(this->refself(), scalar(s)); + return this->refself(); + } + /*! \brief operator overload */ + inline Container &__assign(default_real_t s) { + ExpEngine::Eval(this->refself(), scalar(s)); + return this->refself(); + } + /*! \brief implementation of operator=, note that we can not define container = container */ + template + inline Container &__assign(const Exp &exp) { + ExpEngine::Eval(this->refself(), exp.self()); + return this->refself(); + } + /*! \brief implementation of operator=, note that we can not define conatiner = container */ + template + inline Container &__assign(const Exp &exp) { + ExpEngine::Eval(this->refself(), exp.self()); + return this->refself(); + } + /*! \brief implementation of operator=, note that we can not define container = container */ + template + inline Container &__assign(const Exp &exp) { + ExpEngine::Eval(this->refself(), exp.self()); + return this->refself(); + } + /*! \brief implementation of operator+= */ + template + inline Container &operator+=(const Exp &exp) { + ExpEngine::Eval(this->refself(), exp.self()); + return this->refself(); + } + /*! \brief implementation of operator-= */ + template + inline Container &operator-=(const Exp &exp) { + ExpEngine::Eval(this->refself(), exp.self()); + return this->refself(); + } + /*! \brief implementation of operator*= */ + template + inline Container &operator*=(const Exp &exp) { + ExpEngine::Eval(this->refself(), exp.self()); + return this->refself(); + } + /*! \brief implementation of operator/= */ + template + inline Container &operator/=(const Exp &exp) { + ExpEngine::Eval(this->refself(), exp.self()); + return this->refself(); + } +}; +/*! + * \brief matrix multiplication expression dot(lhs[.T], rhs[.T]) + * \tparam TA type of lhs + * \tparam TB type of rhs + * \tparam ltrans whether lhs is transposed + * \tparam rtrans whether rhs is transposed + */ +template +struct DotExp: public Exp, type::kComplex> { + /*! \brief left operand */ + const TA &lhs_; + /*! \brief right operand */ + const TB &rhs_; + /*! \brief scale over result */ + default_real_t scale_; + /*! \brief constructor */ + explicit DotExp(const TA &lhs, const TB &rhs, default_real_t scale) + : lhs_(lhs), rhs_(rhs), scale_(scale) {} +}; +// definition of dot expression +/*! \brief dot operator def */ +template +inline DotExp +dot(const RValueExp &lhs, const RValueExp &rhs) { + return DotExp(lhs.self(), rhs.self(), 1.0f); +} +/*! \brief dot operator def */ +template +inline DotExp +dot(const TransposeExp &lhs, const RValueExp &rhs) { + return DotExp(lhs.exp, rhs.self(), 1.0f); +} +/*! \brief dot operator def */ +template +inline DotExp +dot(const RValueExp &lhs, const TransposeExp &rhs) { + return DotExp(lhs.self(), rhs.exp, 1.0f); +} +/*! 
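+ * (editorial usage sketch) with 2-D rvalues A, B, C declared elsewhere:
+ *   C = dot(B, A);       // DotExp with ltrans = rtrans = false
+ *   C = dot(B.T(), A);   // lhs transposed through TransposeExp
+ *   C += dot(B, A.T());  // accumulated via sv::plusto
+ */
+/*!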
\brief dot operator def */ +template +inline DotExp +dot(const TransposeExp &lhs, const TransposeExp &rhs) { + return DotExp(lhs.exp, rhs.exp, 1.0f); +} +/*! \brief dot operator def */ +template +inline DotExp +operator*(const DotExp &lhs, default_real_t rhs) { + return DotExp(lhs.lhs_, lhs.rhs_, lhs.scale_ * rhs); +} +/*! \brief scale of dot operation */ +template +inline DotExp +operator*(default_real_t lhs, const DotExp &rhs) { + return DotExp(rhs.lhs_, rhs.rhs_, rhs.scale_ * lhs); +} +//--------------- +// BinaryMapExp +// -------------- +/*! + * \brief binary map expression lhs [op] rhs + * \tparam OP operator + * \tparam TA type of lhs + * \tparam TB type of rhs + * \tparam etype expression type, sa namespace::type + */ +template +struct BinaryMapExp: public Exp< BinaryMapExp, etype> { + /*! \brief left operand */ + const TA &lhs_; + /*! \brief right operand */ + const TB &rhs_; + /*! \brief constructor */ + explicit BinaryMapExp(const TA &lhs, const TB &rhs) + :lhs_(lhs), rhs_(rhs) {} +}; + +/*! \brief make expression */ +template +inline BinaryMapExp +MakeExp(const Exp &lhs, const Exp &rhs) { + return BinaryMapExp(lhs.self(), rhs.self()); +} +/*! + * \brief short hand for MakeExp, usage F(lhs, rhs). create a binary operation expression + * \param lhs left operand + * \param rhs right operand + * \tparam binary operator + * \tparam TA lhs expression + * \tparam ta lhs expression type + * \tparam TB rhs expression + * \tparam tb rhs expression type + * \sa mshadow::op + */ +template +inline BinaryMapExp +F(const Exp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload for const */ +template +inline BinaryMapExp, (ta|type::kMapper)> +F(const Exp &lhs, const ScalarExp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload for const */ +template +inline BinaryMapExp, TB, (tb|type::kMapper)> +F(const ScalarExp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +// operator rules +/*! \brief operator overload */ +template +inline BinaryMapExp +operator+(const Exp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp +operator-(const Exp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp +operator*(const Exp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp +operator/(const Exp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +// constant operators +/*! \brief operator overload */ +template +inline BinaryMapExp, (ta|type::kMapper)> +operator+(const Exp &lhs, const ScalarExp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, (ta|type::kMapper)> +operator-(const Exp &lhs, const ScalarExp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, (ta|type::kMapper)> +operator*(const Exp &lhs, const ScalarExp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, (ta|type::kMapper)> +operator/(const Exp &lhs, const ScalarExp &rhs) { + return MakeExp(lhs, rhs); +} +// constant operators 2 +/*! \brief operator overload */ +template +inline BinaryMapExp, TB, (tb|type::kMapper)> +operator+(const ScalarExp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! 
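+ * (editorial sketch) user-defined functors compose through F the same way;
+ * maxop below is a hypothetical example, not part of this patch:
+ */
+struct maxop {
+  // element-wise maximum, same Map signature as the functors in mshadow::op
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {
+    return a > b ? a : b;
+  }
+};
+// usage: dst = F<maxop>(lhs, rhs);  // chains like the operators above
+/*!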
\brief operator overload */ +template +inline BinaryMapExp, TB, (tb|type::kMapper)> +operator-(const ScalarExp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, TB, (tb|type::kMapper)> +operator*(const ScalarExp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, TB, (tb|type::kMapper)> +operator/(const ScalarExp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +//--------------- +// UnaryMapExp +// -------------- +/*! + * \brief unary map expression op(src) + * \tparam OP operator + * \tparam TA type of src + * \tparam etype expression type, sa namespace::type + */ +template +struct UnaryMapExp: public Exp< UnaryMapExp, etype> { + /*! \brief source expression */ + const TA &src_; + /*! \brief constructor */ + explicit UnaryMapExp(const TA &src) : src_(src) {} +}; + +/*! \brief make expression */ +template +inline UnaryMapExp MakeExp(const Exp &src) { + return UnaryMapExp(src.self()); +} +/*! + * \brief short hand for MakeExp, usage F(src), create a unary operation expression + * \param src source expression + * \tparam operator + * \tparam TA source expression + * \tparam ta source expression type + * \sa mshadow::op + */ +template +inline UnaryMapExp F(const Exp &src) { + return MakeExp(src); +} + + +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXPRESSION_H_ diff --git a/mshadow/tensor.h b/mshadow/tensor.h index d3979b7751a8..87b31192a502 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -1,485 +1,486 @@ -#ifndef MSHADOW_TENSOR_H -#define MSHADOW_TENSOR_H +#ifndef MSHADOW_TENSOR_H_ +#define MSHADOW_TENSOR_H_ /*! * \file tensor.h * \brief header file of tensor data structure and functions - * covention: this lib requires explicit memory allocation and de-allocation - * all the data structure Tensor, Tensor are like handles(pointers), - * no memory allocation is happening during calculation + * This lib requires explicit memory allocation and de-allocation + * all the data structure Tensor, Tensor are like handles(pointers), + * no memory allocation is happening during calculation + * + * For STL style tensor, see tensor_container.h * \author Bing Xu, Tianqi Chen */ -#include "tensor_base.h" -#include "tensor_expr.h" +#include "./base.h" +#include "./expression.h" namespace mshadow { - /*! - * \brief shape of a tensor - * IMPORTANT NOTE: this shape is different from numpy.shape - * shape[0] gives the lowest dimension, shape[dimension-1] gives the highest dimension - * shape[k] corresponds to k-th dimension of tensor - * \tparam dimension dimension of tensor - */ - template - struct Shape { - public: - /*! \brief maximum dimension of tensor */ - const static int kMaxShape = dimension; - /*! \brief maximum dimension minus 1 */ - const static int kSubShape = dimension - 1; - public: - /*! \brief default constructor, do nothing */ - MSHADOW_XINLINE Shape(void) {} - /*! \brief constuctor */ - MSHADOW_XINLINE Shape( const Shape &s ){ - #pragma unroll - for( int i = 0; i < kMaxShape; ++i ){ - this->shape_[i] = s[i]; - } - this->stride_ = s.stride_; - } - /*! - * \brief get corresponding index - * \param idx dimension index - * \return the corresponding dimension size - */ - MSHADOW_XINLINE index_t& operator[](index_t idx) { - return shape_[ idx ]; - } - /*! 
- * \brief get corresponding index - * \param idx dimension index - * \return the corresponding dimension size - */ - MSHADOW_XINLINE const index_t& operator[](index_t idx) const { - return shape_[ idx ]; - } - /*! \return whether two shape equals */ - MSHADOW_XINLINE bool operator==(const Shape &s) const { - #pragma unroll - for ( int i = 0; i < kMaxShape; ++i ) { - if (s.shape_[i] != this->shape_[i]) return false; - } - return true; - } - /*! - * flatten the higher dimension to second dimension, return a 2D shape - * \return the flat 2d shape - */ - MSHADOW_XINLINE Shape<2> FlatTo2D(void) const { - Shape<2> s; - s.stride_ = this->stride_; - s.shape_[ 0 ] = this->shape_[ 0 ]; - index_t ymax = 1; - - #pragma unroll - for (int i = 1; i < kMaxShape; ++i) { - ymax *= this->shape_[ i ]; - } - s.shape_[1] = ymax; - return s; - } - /*! \return number of valid elements */ - MSHADOW_XINLINE size_t Size(void) const{ - size_t memsz = this->shape_[ 0 ]; - #pragma unroll - for (int i = 1; i < kMaxShape; ++i) { - memsz *= this->shape_[ i ]; - } - return memsz; - } - /*! \return memory size, including the aligned x dimension */ - MSHADOW_XINLINE size_t MSize(void) const { - size_t memsz = this->stride_; - #pragma unroll - for (int i = 1; i < kMaxShape; ++i) { - memsz *= this->shape_[ i ]; - } - return memsz; - } - /*! - * \return product shape in [dimstart,dimend) - * \param dimstart start dimension - * \param dimend end dimension - */ - MSHADOW_XINLINE index_t ProdShape( int dimstart, int dimend ) const{ - index_t num = 1; - #pragma unroll - for (int i = dimstart; i < dimend; ++i) { - num *= this->shape_[ i ]; - } - return num; - } - /*! - * \brief get subshape - * \return subshape - */ - MSHADOW_XINLINE Shape SubShape(void) const { - Shape s; - s.stride_ = this->stride_; - // for cuda - #pragma unroll - for (int i = 0; i < kSubShape; ++i) { - s.shape_[ i ] = this->shape_[ i ]; - } - return s; - } - - public: - /*! \brief storing the dimension information */ - index_t shape_[ kMaxShape ]; - /*! - * \brief storing the stride information in x dimension - * this is used to deal with pitch allocation in gpu or sse(align x dimension to 64bit) for efficiency - */ - index_t stride_; - }; - // useful construction functions to generate shape - /*! - * \brief construct a one dimension shape, stride will equal s0 - * \param s0 size of dimension 0 - * \return the shape construction - */ - MSHADOW_XINLINE Shape<1> Shape1( index_t s0 ){ - Shape<1> s; s[0] = s0; s.stride_ = s0; - return s; +/*! \brief device name CPU */ +struct cpu { + /*! \brief whether this device is CPU or not */ + static const bool kDevCPU = true; + /*! \brief device flag number, identifies this device */ + static const int kDevMask = 1<<0; +}; +/*! \brief device name CPU */ +struct gpu { + /*! \brief whether this device is CPU or not */ + static const bool kDevCPU = false; + /*! \brief device flag number, identifies this device */ + static const int kDevMask = 1<<1; +}; +/*! + * \brief shape of a tensor + * IMPORTANT NOTE: this shape is different from numpy.shape + * shape[0] gives the lowest dimension, shape[dimension-1] gives the highest dimension + * shape[k] corresponds to k-th dimension of tensor + * \tparam dimension dimension of tensor + */ +template +struct Shape { + /*! \brief dimension of current shape */ + static const int kDimension = dimension; + /*! \brief dimension of current shape minus one */ + static const int kSubdim = dimension - 1; + /*! \brief storing the dimension information */ + index_t shape_[kDimension]; + /*! 
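+   * (editorial note) in this revision the contiguous (x) dimension is
+   * shape_[kDimension - 1]; stride_ pads only that dimension, so
+   * MSize() can exceed Size() when padded allocation is used.
+   */
+  /*!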
+ * \brief storing the stride information in x dimension + * this is used to deal with pitch allocation in gpu or sse(align x dimension to 64bit) for efficiency + */ + index_t stride_; + /*! \brief default constructor, do nothing */ + MSHADOW_XINLINE Shape(void) {} + /*! \brief constuctor */ + MSHADOW_XINLINE Shape(const Shape &s) { + #pragma unroll + for (int i = 0; i < kDimension; ++i) { + this->shape_[i] = s[i]; } - /*! - * \brief construct a two dimension shape, stride will equal s0 - * \param s1 size of dimension 1 - * \param s0 size of dimension 0 - * \return the shape construction - */ - MSHADOW_XINLINE Shape<2> Shape2( index_t s1, index_t s0 ){ - Shape<2> s; s[0] = s0; s[1] = s1; s.stride_ = s0; - return s; + this->stride_ = s.stride_; + } + /*! + * \brief get corresponding index + * \param idx dimension index + * \return the corresponding dimension size + */ + MSHADOW_XINLINE index_t &operator[](index_t idx) { + return shape_[idx]; + } + /*! + * \brief get corresponding index + * \param idx dimension index + * \return the corresponding dimension size + */ + MSHADOW_XINLINE const index_t &operator[](index_t idx) const { + return shape_[idx]; + } + /*! \return whether two shape equals */ + MSHADOW_XINLINE bool operator==(const Shape &s) const { + #pragma unroll + for (int i = 0; i < kDimension; ++i) { + if (s.shape_[i] != this->shape_[i]) return false; } - /*! - * \brief construct a three dimension shape, stride will equal s0 - * \param s2 size of dimension 2 - * \param s1 size of dimension 1 - * \param s0 size of dimension 0 - * \return the shape construction - */ - MSHADOW_XINLINE Shape<3> Shape3( index_t s2, index_t s1, index_t s0 ){ - Shape<3> s; - s[0] = s0; s[1] = s1; s[2] = s2; s.stride_ = s0; - return s; + return true; + } + /*! + * flatten the higher dimension to second dimension, return a 2D shape + * \return the flat 2d shape + */ + MSHADOW_XINLINE Shape<2> FlatTo2D(void) const { + Shape<2> s; + s.stride_ = this->stride_; + s.shape_[1] = this->shape_[kDimension - 1]; + index_t ymax = 1; + #pragma unroll + for (int i = 0; i < kDimension - 1; ++i) { + ymax *= this->shape_[i]; } - /*! - * \brief construct a four dimension shape, stride will equal s0 - * \param s3 size of dimension 3 - * \param s2 size of dimension 2 - * \param s1 size of dimension 1 - * \param s0 size of dimension 0 - * \return the shape construction - */ - MSHADOW_XINLINE Shape<4> Shape4( index_t s3, index_t s2, index_t s1, index_t s0 ){ - Shape<4> s; - s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; s.stride_ = s0; - return s; + s.shape_[0] = ymax; + return s; + } + /*! \return number of valid elements */ + MSHADOW_XINLINE size_t Size(void) const { + size_t size = this->shape_[0]; + #pragma unroll + for (int i = 1; i < kDimension; ++i) { + size *= this->shape_[i]; } -}; // namespace mshadow - -namespace mshadow { - /*! \brief device name CPU */ - struct cpu { - /*! \brief whether this device is CPU or not */ - const static bool kDevCPU = true; - /*! \brief device flag number, identifies this device */ - const static int kDevMask = 1<<0; - }; - /*! \brief device name CPU */ - struct gpu { - /*! \brief whether this device is CPU or not */ - const static bool kDevCPU = false; - /*! \brief device flag number, identifies this device */ - const static int kDevMask = 1<<1; - }; - - // more compact template - /*! - * \brief general tensor - * \tparam Device which device the tensor is on - * \tparam dimension dimension of the tensor - */ - template - struct Tensor: public expr::ContainerExp< Tensor >{ - public: - /*! 
\brief whether current type lies in cpu */ - const static bool kDevCPU = Device::kDevCPU; - /*! \brief dimension of subtype */ - const static int kSubdim = dimension - 1; - - public: - /*! \brief pointer to the data */ - real_t *dptr; - /*! \brief shape of the tensor */ - Shape shape; - public: - /*! \brief default constructor */ - MSHADOW_XINLINE Tensor(void) {} - /*! \brief constructor from shape */ - MSHADOW_XINLINE Tensor(const Shape &shape): shape(shape) {} - /*! \brief constructor from data pointer and shape */ - MSHADOW_XINLINE Tensor(real_t *dptr, const Shape &shape): dptr((real_t*)dptr), shape(shape) {} - /*! - * \brief return size of i-th dimension, start counting from highest dimension - * This meets the habit of normal usage of size of matrix. Note that mat.shape[0] gives lowest dimension, - * while mat.size(0) returns the highest dimension - * \param the dimension count from the highest dimensin - * \return the size - */ - MSHADOW_XINLINE index_t size(index_t i) const { - return shape[dimension - 1 - i]; - } - /*! - * \brief flatten the tensor to 2 dimension, collapse the higher dimensions together - * \return tensor after flatten - */ - MSHADOW_XINLINE Tensor FlatTo2D(void) const { - return Tensor(reinterpret_cast \ - (dptr), shape.FlatTo2D()); - } - /*! - * \brief get a element of dimension - 1 - * \param idx index - * \return the result tensor - */ - MSHADOW_XINLINE Tensor operator[](index_t idx) const { - Shape s = shape.SubShape(); - return Tensor(reinterpret_cast \ - (dptr) + s.MSize() * idx, s); - } - /*! - * \brief slice the tensor in highest dimension [begin,end) - * \param begin begin position of slice - * \param end end position of slice - * \return tensor after slice - */ - MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const { - Shape s = this->shape; - s[ dimension - 1 ] = end - begin; - return Tensor(reinterpret_cast\ - (dptr) + s.SubShape().MSize() * begin, s); - } - public: - /*!\brief functions to fit expression template */ - inline Tensor& operator=( real_t s ){ - return this->__assign( s ); - } - /*!\brief functions to fit expression template */ - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - /*!\brief functions to fit expression template */ - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - }; - - /* - * respecialized class Tensor1D,thei is due to different implementation in operator[] - */ - template - struct Tensor: public expr::ContainerExp< Tensor >{ - public: - real_t *dptr; - Shape<1> shape; - public: - MSHADOW_XINLINE Tensor(void) {} - MSHADOW_XINLINE Tensor(const Shape<1> &shape): shape(shape) {} - MSHADOW_XINLINE Tensor(real_t *dptr, Shape<1> shape) :dptr(dptr), shape(shape) {} - - MSHADOW_XINLINE Tensor FlatTo2D(void) const { - return Tensor(reinterpret_cast \ - (dptr), shape.FlatTo2D()); - } - MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const { - Shape<1> s; - s[0] = s.stride_ = end - begin; - return Tensor(reinterpret_cast \ - (dptr) + begin, s); - } - MSHADOW_XINLINE index_t size(index_t i) const { - return shape[0]; - } - MSHADOW_XINLINE real_t &operator[](index_t idx) { return dptr[ idx ]; } - MSHADOW_XINLINE const real_t &operator[](index_t idx)const { return dptr[ idx ]; } - public: - // functions to fit expression template - inline Tensor& operator=( double s ){ - return this->__assign( s ); - } - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - template - inline 
Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - }; -}; // namespace mshadow - -// add unroll loops for the shape -namespace mshadow { - // function declarations - /*! - * \brief initialize tensor engine, used to call intialization functions of dependent libs - * this function should be called before all GPU tensor operations, - * for using tensors in CPU, this call is actually not needed - * \param device_id GPU device id to be choosed - */ - inline void InitTensorEngine( int device_id=0 ); - /*! - * \brief Shutdown tensor engine, - * this function should be called after all GPU tensor operations, - * for using tensors in CPU, this call is actually not needed - */ - inline void ShutdownTensorEngine( void ); - - /*! - * \brief CPU/CPU: allocate space for CTensor, according to the shape in the obj - * this function is responsible to set the stride_ in each obj.shape - * \tparam dim specify the dim of tensor - * \param obj the tensor object, with shape specified - * \param pad whether padding dimension 0, to make last dimension aligned, - * padding may help improve efficiency of matrix multiplications - * if true, will allocate space with stride_ that may not equals shape[0] - * if false, will allocate continuous space - */ - template - inline void AllocSpace(Tensor &obj, bool pad = MSHADOW_ALLOC_PAD); - /*! \brief refer to comment of cpu ver \sa AllocSpace */ - template - inline void AllocSpace(Tensor &obj, bool pad = MSHADOW_ALLOC_PAD); - - /*! - * \brief CPU/GPU: free the space of tensor, will set obj.dptr to NULL - * \tparam dim specify the dim of tensor - * \param obj the tensor object - */ - template - inline void FreeSpace(Tensor &obj); - /*! \brief refer to comment of cpu ver \sa FreeSpace */ - template - inline void FreeSpace(Tensor &obj); - - /*! - * \brief CPU/GPU: short cut to allocate and initialize a Tensor - * \tparam Device device of tensor - * \tparam dim dimention of tensor - * \param shape: shape of tensor - * \param initv: initialization value - * \param pad : padding option - * \sa AllocSpace - */ - template - inline Tensor NewTensor(const Shape &shape, real_t initv, bool pad = MSHADOW_ALLOC_PAD); - - /*! - * \brief copy data from one tensor to another, with same shape - * \tparam dim specify the dim of tensor - * \param dst target tensor - * \param src source tensor - */ - template - inline void Copy(Tensor dst, const Tensor &src ); - /*! \brief refer to comment of cpu ver \sa Copy */ - template - inline void Copy(Tensor dst, const Tensor &src ); - /*! \brief refer to comment of cpu ver \sa Copy */ - template - inline void Copy(Tensor dst, const Tensor &src ); - /*! \brief refer to comment of cpu ver \sa Copy */ - template - inline void Copy(Tensor dst, const Tensor &src ); - - - /*! - * \brief CPU/GPU: normalize softmax: dst[i][j] = exp( energy[i][j] ) /( sum_j exp( energy[i][j] ) ) - * \param dst destination - * \param energy input energy - */ - inline void Softmax( Tensor dst, const Tensor &energy ); - /*! \brief refer to comment of cpu ver \sa Softmax */ - inline void Softmax( Tensor dst, const Tensor &energy ); - -}; // namespace mshadow - - -namespace mshadow{ - // function declarations to support expression, no need to understand them - // these functions do not need to be directly used - - /*! 
- * \brief CPU/GPU: map a expression to a tensor, this function calls MapPlan - * \tparam Saver specify storage method - * \tparam dim dim of the tensor, during usage, there is no need to specify this parameter - * \tparam E specifies the expression type, not need to specify this parameter during usage - * \tparam etype expression type - * \param dst destination - * \param exp expression - * \sa namespace mshadow:sv, mshadow::op, mshadow::expr - */ - template - inline void MapExp(Tensor dst, const expr::Exp &exp ); - /*! \brief refer to comment of cpu ver \sa MapExp */ - template - inline void MapExp(Tensor dst, const expr::Exp &exp ); - - /*! - * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in lowest dimension (dimension 0) - * \tparam Saver specify storage method - * \tparam Reducer specify a reducer method - * \tparam E specifies the expression type, not need to specify this parameter during usage - * \tparam etype expression type - * \param dst destination - * \param exp expression - * \param scale scale the result before save - * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr - */ - template - inline void MapReduceKeepLowest( Tensor dst, const expr::Exp &exp, real_t scale = 1.0f ); - /*! \brief refer to comment of cpu ver \sa MapReduceKeepLowest */ - template - inline void MapReduceKeepLowest( Tensor dst, const expr::Exp &exp, real_t scale = 1.0f ); - - - /*! - * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in third dimension (dimension 2) - * \tparam Saver specify storage method - * \tparam Reducer specify a reducer method - * \tparam E specifies the expression type, not need to specify this parameter during usage - * \tparam dimkeep the target dimension to be kept, should be larger than 0, for 0, use MapReduceKeepLowest - * \tparam etype expression type - * \param dst destination - * \param exp expression - * \param scale scale the result before save - * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr - */ - template - inline void MapReduceKeepHighDim( Tensor dst, const expr::Exp &exp, real_t scale = 1.0f ); - /*! \brief refer to comment of cpu ver \sa MapReduceKeepHighDim */ - template - inline void MapReduceKeepHighDim( Tensor dst, const expr::Exp &exp, real_t scale = 1.0f ); - -};// namespace mshadow + return size; + } + /*! \return memory size, including the aligned x dimension */ + MSHADOW_XINLINE size_t MSize(void) const { + size_t memsz = this->stride_; + #pragma unroll + for (int i = 0; i < kDimension - 1; ++i) { + memsz *= this->shape_[i]; + } + return memsz; + } + /*! + * \return product shape in [dimstart,dimend) + * \param dimstart start dimension + * \param dimend end dimension + */ + MSHADOW_XINLINE index_t ProdShape(int dimstart, int dimend) const { + index_t num = 1; + #pragma unroll + for (int i = dimstart; i < dimend; ++i) { + num *= this->shape_[i]; + } + return num; + } + /*! + * \brief get subshape that takes off largest dimension +v * \return subshape + */ + MSHADOW_XINLINE Shape SubShape(void) const { + Shape s; + s.stride_ = this->stride_; + // for cuda + #pragma unroll + for (int i = 0; i < kSubdim; ++i) { + s.shape_[i] = this->shape_[i + 1]; + } + return s; + } +}; // Shape +//------------------------------------------------ +// useful construction functions to generate shape +//------------------------------------------------- +/*! 
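+ * (editorial usage sketch) with the constructors declared below:
+ *   Shape<2> s = Shape2(3, 5);  // s[0] = 3, s[1] = 5, stride_ = 5
+ *   s.Size();                   // 15 valid elements
+ *   s.ProdShape(0, 2);          // the same product, over dims [0, 2)
+ */
+/*!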
+ * \brief construct a one dimension shape, stride will equal s0 + * \param s0 size of dimension 0 + * \return the shape construction + */ +MSHADOW_XINLINE Shape<1> Shape1(index_t s0) { + Shape<1> s; s[0] = s0; s.stride_ = s0; + return s; +} +/*! + * \brief construct a two dimension shape, stride will equal s0 + * \param s0 size of dimension 0 + * \param s1 size of dimension 1 + * \return the shape construction + */ +MSHADOW_XINLINE Shape<2> Shape2(index_t s0, index_t s1) { + Shape<2> s; s[0] = s0; s[1] = s1; s.stride_ = s1; + return s; +} +/*! + * \brief construct a three dimension shape, stride will equal s0 + * \param s0 size of dimension 0 + * \param s1 size of dimension 1 + * \param s2 size of dimension 2 + * \return the shape construction + */ +MSHADOW_XINLINE Shape<3> Shape3(index_t s0, index_t s1, index_t s2) { + Shape<3> s; + s[0] = s0; s[1] = s1; s[2] = s2; s.stride_ = s2; + return s; +} +/*! + * \brief construct a four dimension shape, stride will equal s0 + * \param s3 size of dimension 3 + * \param s2 size of dimension 2 + * \param s1 size of dimension 1 + * \param s0 size of dimension 0 + * \return the shape construction + */ +MSHADOW_XINLINE Shape<4> Shape4(index_t s3, index_t s2, index_t s1, index_t s0) { + Shape<4> s; + s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; s.stride_ = s3; + return s; +} +/*! + * \brief computaion stream structure, used for asynchronize computation + */ +template +struct Stream { +}; +/*! + * \brief Tensor RValue, this is the super type of all kinds of possible tensors + * \tparam Container the tensor type + * \tparam Device which device the tensor is on + * \tparam dimension dimension of the tensor + * \tparam DType the type of elements in the tensor + */ +template +struct TRValue: public expr::RValueExp { +}; +// more compact template +/*! + * \brief general tensor + * \tparam Device which device the tensor is on + * \tparam dimension dimension of the tensor + * \tparam DType the type of elements in the tensor + */ +template +struct Tensor: public TRValue, Device, dimension, DType> { + public: + //-------------------------------- + // struct memembers + //-------------------------------- + /*! \brief whether current type lies in cpu */ + static const bool kDevCPU = Device::kDevCPU; + /*! \brief dimension of subtype */ + static const int kSubdim = dimension - 1; + //-------------------------------- + // struct memembers + //-------------------------------- + /*! \brief pointer to the data */ + DType *dptr; + /*! \brief shape of the tensor */ + Shape shape; + /*! + * \brief stream where the computation lies + * stream is a device dependency concept where each computation + */ + Stream *stream; + //-------------------------------- + // functions + //-------------------------------- + /*! \brief default constructor */ + MSHADOW_XINLINE Tensor(void) : stream(NULL) {} + /*! \brief constructor from shape */ + MSHADOW_XINLINE Tensor(const Shape &shape) : shape(shape), stream(NULL) {} + /*! \brief constructor from data pointer and shape */ + MSHADOW_XINLINE Tensor(DType *dptr, const Shape &shape) + : dptr(dptr), shape(shape), stream(NULL) {} + /*! + * \brief return size of i-th dimension, start counting from highest dimension + * \param the dimension count from the highest dimensin + * \return the size + */ + MSHADOW_XINLINE index_t size(index_t i) const { + return shape[i]; + } + /*! 
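+   * (editorial note) size(i) now reads shape[i] directly, whereas the
+   * removed implementation above returned shape[dimension - 1 - i].
+   */
+  /*!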
+ * \brief flatten the tensor to 2 dimension, collapse the higher dimensions together + * \return tensor after flatten + */ + MSHADOW_XINLINE Tensor FlatTo2D(void) const { + return Tensor(dptr, shape.FlatTo2D()); + } + /*! + * \brief get a element of dimension - 1 + * \param idx index + * \return the result tensor + */ + MSHADOW_XINLINE Tensor operator[](index_t idx) const { + Shape s = shape.SubShape(); + return Tensor(dptr + s.MSize() * idx, s); + } + /*! + * \brief slice the tensor in highest dimension [begin,end) + * \param begin begin position of slice + * \param end end position of slice + * \return tensor after slice + */ + MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const { + Shape s = this->shape; + s[0] = end - begin; + return Tensor(dptr + s.SubShape().MSize() * begin, s); + } + /*!\brief functions to fit expression template */ + inline Tensor &operator=(default_real_t s) { + return this->__assign(s); + } + /*!\brief functions to fit expression template */ + template + inline Tensor &operator=(const expr::Exp &exp) { + return this->__assign(exp); + } + /*!\brief functions to fit expression template */ + template + inline Tensor &operator=(const expr::Exp &exp) { + return this->__assign(exp); + } + /*!\brief functions to fit expression template */ + template + inline Tensor &operator=(const expr::Exp &exp) { + return this->__assign(exp); + } +}; +/* + * respecialized class Tensor1D, thei is due to different implementation in operator[] + */ +template +struct Tensor: public expr::RValueExp, DType> { + public: + DType *dptr; + Shape<1> shape; + Stream *stream; + // constructor + MSHADOW_XINLINE Tensor(void) : stream(NULL) {} + MSHADOW_XINLINE Tensor(const Shape<1> &shape): shape(shape), stream(NULL) {} + MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape) + : dptr(dptr), shape(shape), stream(NULL) {} + MSHADOW_XINLINE Tensor FlatTo2D(void) const { + return Tensor(dptr, shape.FlatTo2D()); + } + MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const { + Shape<1> s; + s[0] = s.stride_ = end - begin; + return Tensor(dptr + begin, s); + } + MSHADOW_XINLINE index_t size(index_t i) const { + return shape[0]; + } + MSHADOW_XINLINE DType &operator[](index_t idx) { return dptr[idx]; } + MSHADOW_XINLINE const DType &operator[](index_t idx)const { return dptr[idx]; } + // functions to fit expression template + inline Tensor &operator=(double s) { + return this->__assign(s); + } + template + inline Tensor &operator=(const expr::Exp &exp) { + return this->__assign(exp); + } + template + inline Tensor &operator=(const expr::Exp &exp) { + return this->__assign(exp); + } + template + inline Tensor &operator=(const expr::Exp &exp) { + return this->__assign(exp); + } +}; +//------------------------ +// Function Declarations +//----------------------- +/*! + * \brief initialize tensor engine, used to call intialization functions of dependent libs + * this function should be called before all GPU tensor operations, + * for using tensors in CPU, this call is actually not needed + * \param device_id GPU device id to be choosed + */ +inline void InitTensorEngine(int device_id = 0); +/*! 
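+ * (editorial usage sketch) a GPU program brackets all tensor work with this
+ * pair; CPU-only code may skip both calls:
+ *   InitTensorEngine(0);       // choose CUDA device 0, set up dependent libs
+ *   // ... tensor computations ...
+ *   ShutdownTensorEngine();
+ */
+/*!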
+ * \brief Shutdown tensor engine,
+ *        this function should be called after all GPU tensor operations,
+ *        for using tensors in CPU, this call is actually not needed
+ */
+inline void ShutdownTensorEngine(void);
-// execution implementation of expression evaluations
-#include "tensor_expr_engine-inl.hpp"
-// cpu implementation of functions
-#include "tensor_cpu-inl.hpp"
-// gpu implementation of functions
-#include "tensor_gpu-inl.hpp"
-// extension of expressions
-#include "tensor_expr_ext.h"
-// io
-#include "tensor_io.h"
-// container
-#include "tensor_container.h"
-// random number generator
-#include "tensor_random.h"
+/*!
+ * \brief CPU/GPU: allocate space for CTensor, according to the shape in the obj
+ *        this function is responsible for setting the stride_ in each obj.shape
+ * \param obj the tensor object, with shape specified
+ * \param pad whether padding dimension 0, to make last dimension aligned,
+ *        padding may help improve efficiency of matrix multiplications
+ *        if true, will allocate space with stride_ that may not equal shape[0]
+ *        if false, will allocate continuous space
+ * \tparam dim specify the dim of tensor
+ * \tparam DType type of element in tensor
+ */
+template<int dim, typename DType>
+inline void AllocSpace(Tensor<cpu, dim, DType> &obj, bool pad = MSHADOW_ALLOC_PAD);
+/*! \brief refer to comment of cpu ver \sa AllocSpace */
+template<int dim, typename DType>
+inline void AllocSpace(Tensor<gpu, dim, DType> &obj, bool pad = MSHADOW_ALLOC_PAD);
+/*!
+ * \brief CPU/GPU: free the space of tensor, will set obj.dptr to NULL
+ * \param obj the tensor object
+ * \tparam dim specify the dim of tensor
+ * \tparam DType type of element in tensor
+ */
+template<int dim, typename DType>
+inline void FreeSpace(Tensor<cpu, dim, DType> &obj);
+/*! \brief refer to comment of cpu ver \sa FreeSpace */
+template<int dim, typename DType>
+inline void FreeSpace(Tensor<gpu, dim, DType> &obj);
+/*!
+ * \brief CPU/GPU: shortcut to allocate and initialize a Tensor
+ * \param shape shape of the tensor
+ * \param initv initialization value
+ * \param pad padding option
+ * \tparam Device device of tensor
+ * \tparam DType type of element in tensor
+ * \tparam dim dimension of tensor
+ * \sa AllocSpace
+ */
+template<typename Device, typename DType, int dim>
+inline Tensor<Device, dim, DType> NewTensor(const Shape<dim> &shape, DType initv, bool pad = MSHADOW_ALLOC_PAD);
+/*!
+ * \brief copy data from one tensor to another, with the same shape
+ * \param dst target tensor
+ * \param src source tensor
+ * \tparam dim specify the dim of tensor
+ * \tparam DType type of element in tensor
+ */
+template<int dim, typename DType>
+inline void Copy(Tensor<cpu, dim, DType> dst, const Tensor<cpu, dim, DType> &src);
+/*! \brief refer to comment of cpu ver \sa Copy */
+template<int dim, typename DType>
+inline void Copy(Tensor<cpu, dim, DType> dst, const Tensor<gpu, dim, DType> &src);
+/*! \brief refer to comment of cpu ver \sa Copy */
+template<int dim, typename DType>
+inline void Copy(Tensor<gpu, dim, DType> dst, const Tensor<cpu, dim, DType> &src);
+/*! \brief refer to comment of cpu ver \sa Copy */
+template<int dim, typename DType>
+inline void Copy(Tensor<gpu, dim, DType> dst, const Tensor<gpu, dim, DType> &src);
+/*!
+ * \brief CPU/GPU: normalize softmax: dst[i][j] = exp(energy[i][j]) / (sum_j exp(energy[i][j]))
+ * \param dst destination
+ * \param energy input energy
+ */
+inline void Softmax(Tensor<cpu, 2> dst, const Tensor<cpu, 2> &energy);
+/*! \brief refer to comment of cpu ver \sa Softmax */
+inline void Softmax(Tensor<gpu, 2> dst, const Tensor<gpu, 2> &energy);
+// function declarations to support expression templates; these functions
+// do not need to be used directly
+/*!
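+ * (editorial usage sketch, CPU build assumed) a typical lifecycle with the
+ * declarations above:
+ *   Tensor<cpu, 2> mat = NewTensor<cpu>(Shape2(2, 3), 0.0f);
+ *   mat[1][2] = 1.0f;   // operator[] peels off one dimension
+ *   FreeSpace(mat);     // releases the buffer, dptr is set to NULL
+ */
+/*!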
+ * \brief CPU/GPU: map a expression to a tensor, this function calls MapPlan + * \tparam Saver specify storage method + * \tparam R specifies the storage type of the tensor + * \tparam dim dim of the tensor, during usage, there is no need to specify this parameter + * \tparam DType the type of elements in the tensor + * \tparam E specifies the expression type, not need to specify this parameter during usage + * \tparam etype expression type + * \param dst destination + * \param exp expression + * \sa namespace mshadow:sv, mshadow::op, mshadow::expr + */ +template +inline void MapExp(TRValue dst, const expr::Exp &exp); +/*! \brief refer to comment of cpu ver \sa MapExp */ +template +inline void MapExp(TRValue dst, const expr::Exp &exp); +/*! + * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in lowest dimension (dimension 0) + * \tparam Saver specify storage method + * \tparam Reducer specify a reducer method + * \tparam R specifies the storage type of the tensor + * \tparam DType the type of elements in the tensor + * \tparam E specifies the expression type, not need to specify this parameter during usage + * \tparam etype expression type + * \param dst destination + * \param exp expression + * \param scale scale the result before save + * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr + */ +template +inline void MapReduceKeepLowest(TRValue dst, const expr::Exp &exp, DType scale = 1); +/*! \brief refer to comment of cpu ver \sa MapReduceKeepLowest */ +template +inline void MapReduceKeepLowest(TRValue dst, const expr::Exp &exp, DType scale = 1); +/*! + * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in third dimension (dimension 2) + * \tparam Saver specify storage method + * \tparam Reducer specify a reducer method + * \tparam R specifies the storage type of the tensor + * \tparam DType the type of elements in the tensor + * \tparam dimkeep the target dimension to be kept, should be larger than 0, for 0, use MapReduceKeepLowest + * \tparam E specifies the expression type, not need to specify this parameter during usage + * \tparam etype expression type + * \param dst destination + * \param exp expression + * \param scale scale the result before save + * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr + */ +template +inline void MapReduceKeepHighDim(TRValue dst, const expr::Exp &exp, DType scale = 1); +/*! \brief refer to comment of cpu ver \sa MapReduceKeepHighDim */ +template +inline void MapReduceKeepHighDim(TRValue dst, const expr::Exp &exp, DType scale = 1); +} // namespace mshadow #endif // TENSOR_H diff --git a/mshadow/tensor_base.h b/mshadow/tensor_base.h deleted file mode 100644 index fe09960d2445..000000000000 --- a/mshadow/tensor_base.h +++ /dev/null @@ -1,298 +0,0 @@ -#ifndef MSHADOW_TENSOR_BASE_H -#define MSHADOW_TENSOR_BASE_H -/*! - * \file tensor_base.h - * \brief definitions of base types, macros functions - * - * \author Bing Xu, Tianqi Chen - */ -#include -#include -#include -#include -#include -// macro defintiions - -/*!\brief if this macro is define to be 1, mshadow should compile without any of other libs */ -#ifndef MSHADOW_STAND_ALONE - #define MSHADOW_STAND_ALONE 0 -#endif - -/*! \brief whether do padding during allocation */ -#ifndef MSHADOW_ALLOC_PAD - #define MSHADOW_ALLOC_PAD true -#endif - -/*! 
- * \brief x dimension of data must be bigger pad_size * ratio to be alloced padded memory, otherwise use tide allocation - * for example, if pad_ratio=2, GPU memory alignement size is 32, then we will only allocate padded memory if x dimension > 64 - * set it to 0 then we will always allocate padded memory - */ -#ifndef MSHADOW_MIN_PAD_RATIO - #define MSHADOW_MIN_PAD_RATIO 2 -#endif - -#if MSHADOW_STAND_ALONE - #define MSHADOW_USE_CBLAS 0 - #define MSHADOW_USE_MKL 0 - #define MSHADOW_USE_CUDA 0 -#endif - -/*! \brief use CBLAS for CBLAS */ -#ifndef MSHADOW_USE_CBLAS - #define MSHADOW_USE_CBLAS 0 -#endif -/*! \brief use MKL for BLAS */ -#ifndef MSHADOW_USE_MKL - #define MSHADOW_USE_MKL 1 -#endif -/*! \brief use CUDA support, must ensure that the cuda include path is correct, or directly compile using nvcc */ -#ifndef MSHADOW_USE_CUDA - #define MSHADOW_USE_CUDA 1 -#endif -/*! \brief use single precition float */ -#ifndef MSHADOW_SINGLE_PRECISION - #define MSHADOW_SINGLE_PRECISION 1 -#endif -/*! \brief whether use SSE */ -#ifndef MSHADOW_USE_SSE - #define MSHADOW_USE_SSE 0 -#endif -/*! \brief whether use NVML to get dynamic info */ -#ifndef MSHADOW_USE_NVML - #define MSHADOW_USE_NVML 0 -#endif -// SSE is conflict with cudacc -#ifdef __CUDACC__ - #undef MSHADOW_USE_SSE - #define MSHADOW_USE_SSE 0 -#endif - -#if MSHADOW_USE_CBLAS -extern "C"{ - #include -} -#elif MSHADOW_USE_MKL - #include - #include - #include - #include -#endif - -#if MSHADOW_USE_CUDA - #include - #include -#endif - -#if MSHADOW_USE_NVML - #include -#endif -// -------------------------------- -// MSHADOW_XINLINE is used for inlining template code for both CUDA and CPU code. -#ifdef MSHADOW_XINLINE - #error "MSHADOW_XINLINE must not be defined" -#endif -#ifdef __CUDACC__ - #define MSHADOW_XINLINE inline __attribute__((always_inline)) __device__ __host__ -#else - #define MSHADOW_XINLINE inline __attribute__((always_inline)) -#endif -/*! \brief cpu force inline */ -#define MSHADOW_CINLINE inline __attribute__((always_inline)) - -#if defined(__GXX_EXPERIMENTAL_CXX0X) || defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L - #define MSHADOW_CONSTEXPR constexpr -#else - #define MSHADOW_CONSTEXPR const -#endif - -/*! \brief namespace for mshadow */ -namespace mshadow { - /*! \brief buffer size for each random number generator */ - const unsigned kRandBufferSize = 1000000; - /*! \brief pi */ - const float kPi = 3.1415926f; - -#if MSHADOW_SINGLE_PRECISION - /*! \brief type that will be used for content */ - typedef float real_t; -#else - typedef double real_t; -#endif - /*! \brief type that will be used for index */ - typedef unsigned index_t; -}; // namespace mshadow - -namespace mshadow { - /*! \brief namespace for operators */ - namespace op { - // binary operator - /*! \brief mul operator */ - struct mul{ - /*! \brief map a, b to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { - return a * b; - } - }; - /*! \brief plus operator */ - struct plus { - /*! \brief map a, b to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { - return a + b; - } - }; - /*! \brief minus operator */ - struct minus { - /*! \brief map a, b to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { - return a - b; - } - }; - /*! \brief divide operator */ - struct div { - /*! \brief map a, b to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { - return a / b; - } - }; - /*! 
\brief get rhs */ - struct right { - /*! \brief map a, b to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { - return b; - } - }; - }; // namespace op - - /*! \brief namespace for savers */ - namespace sv { - /*! \brief save to saver: = */ - struct saveto { - /*! \brief save b to a using save method */ - MSHADOW_XINLINE static void Save(real_t& a, real_t b) { - a = b; - } - /*! \brief helper constant to use BLAS, alpha */ - MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f; - /*! \brief helper constant to use BLAS, beta */ - MSHADOW_CONSTEXPR static real_t kBetaBLAS = 0.0f; - /*! \brief corresponding binary operator type */ - typedef op::right OPType; - }; - /*! \brief save to saver: += */ - struct plusto { - /*! \brief save b to a using save method */ - MSHADOW_XINLINE static void Save(real_t& a, real_t b) { - a += b; - } - /*! \brief helper constant to use BLAS, alpha */ - MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f; - /*! \brief helper constant to use BLAS, beta */ - MSHADOW_CONSTEXPR static real_t kBetaBLAS = 1.0f; - /*! \brief corresponding binary operator type */ - typedef op::plus OPType; - }; - /*! \brief minus to saver: -= */ - struct minusto { - /*! \brief save b to a using save method */ - MSHADOW_XINLINE static void Save(real_t& a, real_t b) { - a -= b; - } - /*! \brief helper constant to use BLAS, alpha */ - MSHADOW_CONSTEXPR static real_t kAlphaBLAS = -1.0f; - /*! \brief helper constant to use BLAS, beta */ - MSHADOW_CONSTEXPR static real_t kBetaBLAS = 1.0f; - /*! \brief corresponding binary operator type */ - typedef op::minus OPType; - }; - /*! \brief multiply to saver: *= */ - struct multo { - /*! \brief save b to a using save method */ - MSHADOW_XINLINE static void Save(real_t& a, real_t b) { - a *= b; - } - /*! \brief corresponding binary operator type */ - typedef op::mul OPType; - }; - /*! \brief divide to saver: /= */ - struct divto { - /*! \brief save b to a using save method */ - MSHADOW_XINLINE static void Save(real_t& a, real_t b) { - a /= b; - } - /*! \brief corresponding binary operator type */ - typedef op::div OPType; - }; - }; // namespace sv - - - namespace op { - // unary operator/ function: example - // these operators can be defined by user, in the same style as binary and unary operator - // to use, simply write F( src ) - /*! \brief identity function that maps a real number to it self */ - struct identity{ - /*! \brief map a to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a) { - return a; - } - }; - }; // namespace op - - /*! \brief namespace for potential reducer operations */ - namespace red { - /*! \brief sum reducer */ - struct sum { - /*! \brief do reduction into dst */ - MSHADOW_XINLINE static void Reduce( volatile real_t& dst, volatile real_t src ) { - dst += src; - } - /*! \brief calculate gradient of redres with respect to redsrc, redres: reduced result, redsrc: one of reduction element */ - MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t redsrc ) { - return 1.0f; - } - /*! \brief an intial value of reducer */ - MSHADOW_CONSTEXPR static real_t kInitV = 0.0f; - }; - /*! \brief maximum reducer */ - struct maximum { - /*! \brief do reduction into dst */ - MSHADOW_XINLINE static void Reduce( volatile real_t& dst, volatile real_t src ) { - using namespace std; - dst = max( dst, src ); - } - /*! 
\brief calculate gradient of redres with respect to redsrc, redres: reduced result, redsrc: one of reduction element */ - MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t redsrc ) { - return redres == redsrc ? 1.0f: 0.0f; - } - /*! \brief an intial value of reducer */ -#if MSHADOW_SINGLE_PRECISION - MSHADOW_CONSTEXPR static real_t kInitV = -FLT_MAX; -#else - MSHADOW_CONSTEXPR static real_t kInitV = -DBL_MAX; -#endif - }; - }; - - /*! \brief namespace for helper utils of the project */ - namespace utils{ - /*! \brief send error message then exit */ - inline void Error( const char *msg ){ - fprintf( stderr, "Error:%s\n",msg ); - exit( -1 ); - } - /*! \brief assert a expression is true */ - inline void Assert( bool exp ){ - if( !exp ) Error( "AssertError" ); - } - /*! \brief assert a expression is true */ - inline void Assert( bool exp, const char *msg ){ - if( !exp ) Error( msg ); - } - /*! \brief warning */ - inline void Warning( const char *msg ){ - fprintf( stderr, "warning:%s\n",msg ); - } - }; // namespace utils -}; // namespace mshadow -#endif // TENSOR_BASE_H diff --git a/mshadow/tensor_expr.h b/mshadow/tensor_expr.h deleted file mode 100644 index 39fae450a463..000000000000 --- a/mshadow/tensor_expr.h +++ /dev/null @@ -1,367 +0,0 @@ -#ifndef MSHADOW_TENSOR_EXPR_H -#define MSHADOW_TENSOR_EXPR_H -/*! - * \file tensor_expr.h - * \brief definitions of abstract expressions and expressions template - * \author Tianqi Chen, Bing Xu - */ -#include "tensor_base.h" - -namespace mshadow{ - /*! - * \brief namespace for abstract expressions and expressions template, - * have no dependecy on tensor.h, - * These data structure takes no charge in computations, - * they are only used to define operations and represent expression in a symbolic way - */ - namespace expr{ - - /*! \brief type of expressions */ - namespace type{ - /*! \brief this expression directly correspnds to a data class */ - const int kContainer = 0; - /*! \brief this only contains element-wise vector operations */ - const int kMapper = 1; - /*! \brief othercase: e.g dot product */ - const int kComplex = 3; - }; - - /*! - * \brief expression engine that actually interprets these expressions - * this is a function template that needed to be implemented for specific expressions - */ - template - struct ExpEngine{ - template - inline static void Eval( Container& dst, const EType &exp ); - }; - - template - class ContainerExp; - class ScalarExp; - - /*! - * \brief base class for expression - * \tparam SubType inheritated class must put their type into this parameter - * \tparam exp_type expression type, see namespace type - */ - template - struct Exp{ - public: - /*! \return subtype instance of current class */ - inline const SubType& self( void ) const{ - return *static_cast(this); - } - /*! \return reference of subtype instance of current class */ - inline SubType& refself( void ){ - return *static_cast(this); - } - }; - - /*! \brief scalar expression */ - struct ScalarExp: public Exp{ - /*! \brief scalar value */ - real_t scalar_; - /*! \brief constructor */ - ScalarExp( real_t scalar ):scalar_(scalar){} - }; - - /*! \brief represent a transpose expression of a container */ - template - struct TransposeExp: public Exp< TransposeExp, type::kMapper >{ - public: - /*! \brief expression to be transposed */ - const EType &exp; - /*! \brief constructor */ - TransposeExp( const EType &e ):exp(e){} - /*! \brief transpose expression */ - inline const EType & T( void ) const{ - return exp; - } - }; - - /*! 
- * \brief base class of all variables, that can be assigned to values - * \tparam Container the actually class of data container, e.g. CTensor1D - */ - template - class ContainerExp: public Exp< Container, type::kContainer >{ - public: - /*! - *\brief transpose of a matrix - *\return transpose of current expression - */ - inline const TransposeExp T( void ) const{ - return TransposeExp( this->self() ); - } - public: - /*! \brief operator overload */ - inline Container &operator+=( real_t s ){ - ExpEngine::Eval( this->refself(), ScalarExp(s) ); - return this->refself(); - } - /*! \brief operator overload */ - inline Container &operator-=( real_t s ){ - ExpEngine::Eval( this->refself(), ScalarExp(s) ); - return this->refself(); - } - /*! \brief operator overload */ - inline Container &operator*=( real_t s ){ - ExpEngine::Eval( this->refself(), ScalarExp(s) ); - return this->refself(); - } - /*! \brief operator overload */ - inline Container &operator/=( real_t s ){ - ExpEngine::Eval( this->refself(), ScalarExp(s) ); - return this->refself(); - } - /*! \brief operator overload */ - inline Container &__assign( real_t s ){ - ExpEngine::Eval( this->refself(), ScalarExp(s) ); - return this->refself(); - } - public: - /*! \brief implementation of operator=, note that we can not define container = container */ - template - inline Container &__assign( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - /*! \brief implementation of operator=, note that we can not define container = container */ - template - inline Container &__assign( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - /*! \brief implementation of operator+= */ - template - inline Container &operator+=( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - /*! \brief implementation of operator-= */ - template - inline Container &operator-=( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - /*! \brief implementation of operator*= */ - template - inline Container &operator*=( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - /*! \brief implementation of operator/= */ - template - inline Container &operator/=( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - }; - }; // namespace expr - - namespace expr{ - /*! - * \brief matrix multiplication expression dot( lhs[.T], rhs[.T] ) - * \tparam TA type of lhs - * \tparam TB type of rhs - * \tparam ltrans whether lhs is transposed - * \tparam rtrans whether rhs is transposed - */ - template - struct DotExp: public Exp< DotExp, type::kComplex >{ - /*! \brief left operand */ - const TA& lhs_; - /*! \brief right operand */ - const TB& rhs_; - /*! \brief scale over result */ - real_t scale_; - /*! \brief constructor */ - DotExp( const TA &lhs, const TB &rhs, real_t scale ) - :lhs_(lhs),rhs_(rhs),scale_(scale){} - }; - - /*! \brief dot operator def */ - template - inline DotExp dot( const ContainerExp &lhs, const ContainerExp &rhs ){ - return DotExp( lhs.self(), rhs.self(), 1.0f ); - } - /*! \brief dot operator def */ - template - inline DotExp dot( const TransposeExp &lhs, const ContainerExp &rhs ){ - return DotExp( lhs.exp, rhs.self(), 1.0f ); - } - /*! 
\brief dot operator def */ - template - inline DotExp dot( const ContainerExp &lhs, const TransposeExp &rhs ){ - return DotExp( lhs.self(), rhs.exp, 1.0f ); - } - /*! \brief dot operator def */ - template - inline DotExp dot( const TransposeExp &lhs, const TransposeExp &rhs ){ - return DotExp( lhs.exp, rhs.exp, 1.0f ); - } - /*! \brief dot operator def */ - template - inline DotExp operator*( const DotExp &lhs, real_t rhs ){ - return DotExp( lhs.lhs_, lhs.rhs_, lhs.scale_ * rhs ); - } - /*! \brief scale of dot operation */ - template - inline DotExp operator*( real_t lhs, const DotExp &rhs ){ - return DotExp( rhs.lhs_, rhs.rhs_, rhs.scale_ * lhs ); - } - }; // namespace expr - - namespace expr{ - /*! - * \brief binary map expression lhs [op] rhs - * \tparam OP operator - * \tparam TA type of lhs - * \tparam TB type of rhs - * \tparam etype expression type, sa namespace::type - */ - template - struct BinaryMapExp: public Exp< BinaryMapExp, etype >{ - /*! \brief left operand */ - const TA& lhs_; - /*! \brief right operand */ - const TB& rhs_; - /*! \brief constructor */ - BinaryMapExp( const TA &lhs, const TB &rhs ) - :lhs_(lhs), rhs_(rhs){} - }; - - /*! \brief make expression */ - template - inline BinaryMapExp MakeExp( const Exp &lhs, const Exp &rhs ){ - return BinaryMapExp( lhs.self(), rhs.self() ); - } - - /*! - * \brief short hand for MakeExp, usage F(lhs, rhs). create a binary operation expression - * \param lhs left operand - * \param rhs right operand - * \tparam binary operator - * \tparam TA lhs expression - * \tparam ta lhs expression type - * \tparam TB rhs expression - * \tparam tb rhs expression type - * \sa mshadow::op - */ - template - inline BinaryMapExp F( const Exp &lhs, const Exp &rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload for const */ - template - inline BinaryMapExp F( const Exp &lhs, const ScalarExp &rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload for const */ - template - inline BinaryMapExp F( const ScalarExp &lhs, const Exp& rhs ){ - return MakeExp( lhs, rhs ); - } - - // operator rules - /*! \brief operator overload */ - template - inline BinaryMapExp operator+( const Exp &lhs, const Exp &rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator-( const Exp &lhs, const Exp &rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator*( const Exp &lhs, const Exp &rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator/( const Exp &lhs, const Exp &rhs ){ - return MakeExp( lhs, rhs ); - } - // constant operators - /*! \brief operator overload */ - template - inline BinaryMapExp operator+( const Exp& lhs, const ScalarExp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator-( const Exp& lhs, const ScalarExp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator*( const Exp& lhs, const ScalarExp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator/( const Exp& lhs, const ScalarExp& rhs ){ - return MakeExp( lhs, rhs ); - } - // constant operators 2 - /*! \brief operator overload */ - template - inline BinaryMapExp operator+( const ScalarExp& lhs, const Exp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! 
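(editor's aside: a custom-operator sketch; the scalar overloads continue below) */
// ---- editor's example, not part of the patch -----------------------------
// F<OP>(lhs, rhs) accepts any struct that exposes a static Map(a, b), so
// user code can fuse its own element-wise ops. The `maximum` op below is
// ours (legacy real_t API assumed):
struct maximum {
  MSHADOW_XINLINE static mshadow::real_t Map(mshadow::real_t a,
                                             mshadow::real_t b) {
    return a > b ? a : b;
  }
};
inline void relu_sketch(mshadow::Tensor<mshadow::cpu, 2> out,
                        mshadow::Tensor<mshadow::cpu, 2> in) {
  out = mshadow::expr::F<maximum>(in, mshadow::expr::ScalarExp(0.0f));
}
// --------------------------------------------------------------------------
/*!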
\brief operator overload */ - template - inline BinaryMapExp operator-( const ScalarExp& lhs, const Exp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator*( const ScalarExp& lhs, const Exp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator/( const ScalarExp& lhs, const Exp& rhs ){ - return MakeExp( lhs, rhs ); - } - }; - - namespace expr{ - /*! - * \brief unary map expression op(src) - * \tparam OP operator - * \tparam TA type of src - * \tparam etype expression type, sa namespace::type - */ - template - struct UnaryMapExp: public Exp< UnaryMapExp, etype >{ - /*! \brief source expression */ - const TA& src_; - /*! \brief constructor */ - UnaryMapExp( const TA &src ):src_(src){} - }; - - /*! \brief make expression */ - template - inline UnaryMapExp MakeExp( const Exp &src ){ - return UnaryMapExp( src.self() ); - } - - /*! - * \brief short hand for MakeExp, usage F(src), create a unary operation expression - * \param src source expression - * \tparam operator - * \tparam TA source expression - * \tparam ta source expression type - * \sa mshadow::op - */ - template - inline UnaryMapExp F( const Exp &src ){ - return MakeExp(src); - } - }; -}; -#endif diff --git a/mshadow/utils.h b/mshadow/utils.h new file mode 100644 index 000000000000..33b038d2418a --- /dev/null +++ b/mshadow/utils.h @@ -0,0 +1,79 @@ +#ifndef MSHADOW_UTILS_H_ +#define MSHADOW_UTILS_H_ +/*! + * \file utils.h + * \brief simple utils for error and checkings + * \author Tianqi Chen + */ +#define _CRT_SECURE_NO_WARNINGS +#include +#include +#include +namespace mshadow { +/*! \brief namespace for helper utils of the project */ +namespace utils { +/*! \brief error message buffer length */ +const int kPrintBuffer = 1 << 12; + +#ifndef MSHADOW_CUSTOMIZE_ASSERT_ +/*! + * \brief handling of Assert error, caused by in-apropriate input + * \param msg error message + */ +inline void HandleAssertError(const char *msg) { + fprintf(stderr, "AssertError:%s\n", msg); + exit(-1); +} +/*! + * \brief handling of Check error, caused by in-apropriate input + * \param msg error message + */ +inline void HandleCheckError(const char *msg) { + fprintf(stderr, "%s\n", msg); + exit(-1); +} +#else +// include declarations, some one must implement this +void HandleAssertError(const char *msg); +void HandleCheckError(const char *msg); +void HandlePrint(const char *msg); +#endif + +/*! \brief assert an condition is true, use this to handle debug information */ +inline void Assert(bool exp, const char *fmt, ...) { + if (!exp) { + std::string msg(kPrintBuffer, '\0'); + va_list args; + va_start(args, fmt); + vsnprintf(&msg[0], kPrintBuffer, fmt, args); + va_end(args); + HandleAssertError(msg.c_str()); + } +} + +/*!\brief same as assert, but this is intended to be used as message for user*/ +inline void Check(bool exp, const char *fmt, ...) { + if (!exp) { + std::string msg(kPrintBuffer, '\0'); + va_list args; + va_start(args, fmt); + vsnprintf(&msg[0], kPrintBuffer, fmt, args); + va_end(args); + HandleCheckError(msg.c_str()); + } +} + +/*! \brief report error message, same as check */ +inline void Error(const char *fmt, ...) 
{
+  {
+    std::string msg(kPrintBuffer, '\0');
+    va_list args;
+    va_start(args, fmt);
+    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
+    va_end(args);
+    HandleCheckError(msg.c_str());
+  }
+}
+}  // namespace utils
+}  // namespace mshadow
+#endif

From ecfadfc6d96c0d41c7e92f722c4334dac567d2c9 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Wed, 24 Dec 2014 06:40:23 -0800
Subject: [PATCH 014/147] fix the operator issues in scalar with macro

---
 mshadow/expr_engine-inl.h          | 460 +++++++++++++++++++++++++++++
 mshadow/expression-inl.h           |  86 ++++++
 mshadow/expression.h               | 236 ++++++---------
 mshadow/tensor.h                   |  38 ++-
 mshadow/tensor_expr_engine-inl.hpp | 446 ----------------------------
 5 files changed, 649 insertions(+), 617 deletions(-)
 create mode 100644 mshadow/expr_engine-inl.h
 create mode 100644 mshadow/expression-inl.h
 delete mode 100644 mshadow/tensor_expr_engine-inl.hpp

diff --git a/mshadow/expr_engine-inl.h b/mshadow/expr_engine-inl.h
new file mode 100644
index 000000000000..5ddfd53afd39
--- /dev/null
+++ b/mshadow/expr_engine-inl.h
@@ -0,0 +1,460 @@
+#ifndef MSHADOW_EXPR_ENGINE_INL_H_
+#define MSHADOW_EXPR_ENGINE_INL_H_
+/*!
+ * \file expr_engine-inl.h
+ * \brief definitions of how expressions should be evaluated
+ * \author Tianqi Chen, Bing Xu
+ */
+#include "./expression.h"
+#include "./tensor.h"
+
+namespace mshadow {
+namespace expr {
+/*!
+ * \brief a general class that allows extension that makes tensors of some shape
+ * \tparam SubType type of subclass
+ * \tparam SrcExp source expression of the MakeTensorExp, the source of operation
+ * \tparam dim dimension of the expression
+ */
+template
+struct MakeTensorExp: public Exp< MakeTensorExp, type::kChainer> {
+  /*! \brief the shape of this expression */
+  Shape shape_;
+  /*! \brief true self of subtype */
+  inline const SubType& real_self(void) const{
+    return *static_cast(this);
+  }
+};
+//----------------------------------------------------------------------
+// This part of the code gives plans that can be used to carry out execution
+//---------------------------------------------------------------------
+// Declarations of plans
+template
+class Plan {
+ public:
+  /*! 
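(editor's aside: a note on the utils.h checks added above; Plan's contract follows) */
// ---- editor's note, not part of the patch --------------------------------
// Usage sketch for the printf-style Check/Assert introduced in utils.h
// above (argument names are ours):
//   utils::Check(a.size(0) == b.size(0),
//                "shape mismatch: %u vs %u",
//                static_cast<unsigned>(a.size(0)),
//                static_cast<unsigned>(b.size(0)));
//   utils::Assert(a.size(0) != 0, "empty input");
// Check is meant for user-facing validation and always fires; Assert is
// meant for debug-time invariants, per the comments in the file above.
// --------------------------------------------------------------------------
/*!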
+ * \brief evaluate the expression at index [y][x] + * to be implemented by SubType, for RValue, the return type will be DType & + */ + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const; +}; +// tensor plan +template +class Plan, DType> { + public: + Plan(const Tensor &t) + : dptr_(t.dptr), stride_(t.shape.stride_) {} + // for RValue, the return type should be reference + MSHADOW_XINLINE DType &Eval(index_t y, index_t x) { + return dptr_[y * stride_ + x]; + } + // const evaluation + MSHADOW_XINLINE const DType &Eval(index_t y, index_t x) const { + return dptr_[y * stride_ + x]; + } + + private: + real_t *dptr_; + index_t stride_; +}; +// special evaluation case for 1d tensor, no stride +template +class Plan, DType> { + public: + Plan(const Tensor &t) : dptr_(t.dptr) {} + MSHADOW_XINLINE DType &Eval(index_t y, index_t x) { + return dptr_[x]; + } + MSHADOW_XINLINE const DType &Eval(index_t y, index_t x) const { + return dptr_[x]; + } + + private: + real_t *dptr_; +}; +// scalar +template +class Plan, DType> { + public: + Plan(DType scalar) : scalar_(scalar) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return scalar_; + } + + private: + DType scalar_; +}; +// binary expression +template +class Plan, DType> { + public: + Plan(const Plan &lhs, const Plan &rhs) + : lhs_(lhs), rhs_(rhs) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return OP::Map(lhs_.Eval(y, x), rhs_.Eval(y, x)); + } + + private: + Plan lhs_; + Plan rhs_; +}; +// unary expression +template +class Plan, DType> { + public: + Plan(const Plan &src) : src_(src) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return OP::Map(src_.Eval(y, x)); + } + + private: + Plan src_; +}; +// remaps map tensor expression to subtype's plan +template +struct Plan< MakeTensorExp > { + public: + Plan(const Plan &src):src_(src) {} + MSHADOW_XINLINE real_t Eval(index_t y, index_t x) const { + return src_.Eval(y, x); + } + + private: + Plan src_; +}; +// tranpsoe +template +class Plan, DType> { + public: + Plan(const Plan &src) : src_(src) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return src_.Eval(x, y); + } + + private: + Plan src_; +}; +//---------------------------------------------------------------------- +// Mappings from expression to plans +//--------------------------------------------------------------------- +template +inline Plan, DType> MakePlan(const BinaryMapExp &e); + +template +inline Plan MakePlan(const ScalarExp &e) { + return Plan(e.scalar_); +} + +template +inline Plan MakePlan(const RValueExp &e) { + return Plan(e.self()); +} + +template +inline Plan, DType> MakePlan(const TransposeExp &e) { + return Plan >(MakePlan(e.exp)); +} + +template +inline Plan< T > MakePlan(const MakeTensorExp &e) { + return Plan< T >(e.real_self()); +} + +template +inline Plan< UnaryMapExp > MakePlan(const UnaryMapExp &e) { + return Plan< UnaryMapExp >(MakePlan(e.src_)); +} + +template +inline Plan< BinaryMapExp > MakePlan(const BinaryMapExp &e) { + return Plan< BinaryMapExp >(MakePlan(e.lhs_), MakePlan(e.rhs_)); +} +}; // namespace expr + + namespace expr{ + /*! 
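(editor's aside: an evaluation sketch; the trait section resumes below) */
// ---- editor's example, not part of the patch -----------------------------
// What MakePlan above yields, in spirit: a nested functor whose Eval(y, x)
// fuses the whole expression tree at one coordinate. The dispatcher that
// consumes it ultimately runs a loop of this shape (our sketch; the real
// one lives in the device-specific -inl headers):
template<typename PlanT, typename DType>
inline void RunPlanSketch(DType *dst, index_t stride,
                          Shape<2> shape, const PlanT &plan) {
  for (index_t y = 0; y < shape[0]; ++y) {
    for (index_t x = 0; x < shape[1]; ++x) {
      dst[y * stride + x] = plan.Eval(y, x);  // one fused evaluation per cell
    }
  }
}
// --------------------------------------------------------------------------
/*!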
+ * \brief static type inference template,
+ *        used to get the dimension of each expression,
+ *        if ExpInfo::kDim == -1, this means there is a mismatch in the expression
+ *        if (ExpInfo::kDevMask & cpu::kDevMask) != 0, this means the expression can be assigned to cpu
+ * \tparam E expression
+ */
+template
+struct ExpInfo{
+  const static int kDim = -1;
+  const static int kDevMask = 0;
+};
+template<>
+struct ExpInfo {
+  const static int kDim = 0;
+  const static int kDevMask = 0xffff;
+};
+template
+struct ExpInfo > {
+  const static int kDim = ExpInfo::kDim;
+  const static int kDevMask = ExpInfo::kDevMask;
+};
+template
+struct ExpInfo< Tensor > {
+  const static int kDim = dim;
+  const static int kDevMask = Device::kDevMask;
+};
+template
+struct ExpInfo< MakeTensorExp > {
+  const static int kDimSrc = ExpInfo::kDim;
+  const static int kDim = kDimSrc >= 0 ? dim : -1;
+  const static int kDevMask = ExpInfo::kDevMask;
+};
+template
+struct ExpInfo< UnaryMapExp > {
+  const static int kDim = ExpInfo::kDim;
+  const static int kDevMask = ExpInfo::kDevMask;
+};
+template
+struct ExpInfo< BinaryMapExp > {
+  const static int kDimLhs = ExpInfo::kDim;
+  const static int kDimRhs = ExpInfo::kDim;
+  const static int kDim = (kDimLhs>=0 && kDimRhs >= 0) ? \
+      (kDimLhs==0 ? kDimRhs : ((kDimRhs==0||kDimLhs==kDimRhs) ? kDimLhs : -1)):-1;
+  const static int kDevMask = ExpInfo::kDevMask & ExpInfo::kDevMask;
+};

+/*! \brief template to do type check */
+template
+struct TypeCheck{
+  /*! \brief dimension of expression*/
+  const static int kExpDim = ExpInfo::kDim;
+  /*! \brief whether the expression device type matches */
+  const static bool kDevPass = (ExpInfo::kDevMask & Device::kDevMask) != 0;
+  /*! \brief whether the expression can be mapped to expression of dim */
+  const static bool kMapPass = (kExpDim == 0 || kExpDim == dim) && kDevPass;
+  /*! 
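 * (editor's note: an illustration of how these flags and the TypeCheckPass
 *  guard below cooperate, in miniature and independent of mshadow — names
 *  here are ours:
 *    template<bool pass> struct CheckPass;
 *    template<> struct CheckPass<true> { static void Error_Dim(void) {} };
 *    CheckPass<2 == 2>::Error_Dim();  // compiles to nothing
 *    CheckPass<2 == 3>::Error_Dim();  // compile error naming Error_Dim
 *  a false flag selects the undefined primary template, so the compiler
 *  message spells out which requirement failed)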
\brief whether the expression can be reduced to expression of dim */ + const static bool kRedPass = (kExpDim > dim) && kDevPass; + }; + + template + struct TypeCheckPass; + template<> + struct TypeCheckPass {}; + template<> + struct TypeCheckPass { + inline static void Error_All_Tensor_in_Exp_Must_Have_Same_Type(void) {} + inline static void Error_TypeCheck_Not_Pass_For_Reduce_Exp(void) {} + inline static void Error_Expression_Does_Not_Meet_Dimension_Req(void) {} + }; + }; // namespace expr + + namespace expr{ + // check shape consistency + template + struct ShapeCheck{ + inline static Shape Check(const E &t); + }; + + template + struct ShapeCheck { + inline static Shape Check(const ScalarExp &exp) { + // use lowest dimension to mark scalar exp + Shape shape; shape[0] = 0; + return shape; + } + }; + template + struct ShapeCheck > { + inline static Shape Check(const TransposeExp< E > &e) { + // swap the lowest two dimensions + Shape s = ShapeCheck::Check(e.exp); + std::swap(s[0], s[1]); + return s; + } + }; + template + struct ShapeCheck > { + inline static Shape Check(const Tensor &t) { + return t.shape; + } + }; + template + struct ShapeCheck > { + inline static Shape Check(const MakeTensorExp &t) { + return t.shape_; + } + }; + template + struct ShapeCheck< dim,UnaryMapExp > { + inline static Shape Check(const UnaryMapExp &t) { + Shape s = ShapeCheck::Check(t.src_); + return s; + } + }; + template + struct ShapeCheck< dim, BinaryMapExp > { + inline static Shape Check(const BinaryMapExp &t) { + Shape shape1 = ShapeCheck::Check(t.lhs_); + Shape shape2 = ShapeCheck::Check(t.rhs_); + if(shape1[0] == 0) return shape2; + if(shape2[0] == 0) return shape1; + utils::Assert(shape1 == shape2, "BinaryMapExp: Shapes of two tensors in BinaryMapExp expression is not the same"); + return shape1; + } + }; + }; // namespace expr + + // the matrix OP depends on BLAS + namespace expr{ + template + struct DotEngine{ + inline static void Eval(Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale); + }; + + // handles the dot + template + struct BLASEngine; + + #if (MSHADOW_USE_CBLAS||MSHADOW_USE_MKL) + template<> + struct BLASEngine { + inline static CBLAS_TRANSPOSE GetT(bool t) { + return t ? 
CblasTrans : CblasNoTrans; + } + inline static void gemm(bool transa, bool transb, int m, int n, int k, float alpha, \ + const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { + cblas_sgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); + } + inline static void gemm(bool transa, bool transb, int m, int n, int k, double alpha, \ + const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc) { + cblas_dgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); + } + inline static void gemv(bool trans, int m, int n, float alpha, const float *A, int lda, \ + const float *X, int incX, float beta, float *Y, int incY) { + cblas_sgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); + } + inline static void gemv(bool trans, int m, int n, double alpha, const double *A, int lda, \ + const double *X, int incX, double beta, double *Y, int incY) { + cblas_dgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); + } + inline static void ger(int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda) { + cblas_sger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); + } + inline static void ger(int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda) { + cblas_dger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); + } + }; + #endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL + + #if MSHADOW_USE_CUDA + // All CuBLAS goes to here, use legacy API: not threadsafe + template<> + struct BLASEngine { + inline static char GetT(bool t) { + return t ? 'T' : 'N'; + } + inline static void gemm(bool transa, bool transb, int m, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { + cublasSgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); + } + inline static void gemm(bool transa, bool transb, int m, int n, int k, double alpha, + const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc) { + cublasDgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); + } + inline static void gemv(bool trans, int m, int n, float alpha, const float *A, int lda, \ + const float *X, int incX, float beta, float *Y, int incY) { + cublasSgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); + } + inline static void gemv(bool trans, int m, int n, double alpha, const double *A, int lda, \ + const double *X, int incX, double beta, double *Y, int incY) { + cublasDgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); + } + inline static void ger(int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda) { + cublasSger(m,n,alpha,X,incX,Y,incY,A,lda); + } + inline static void ger(int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda) { + cublasDger(m,n,alpha,X,incX,Y,incY,A,lda); + } + }; + #endif + + // helper function to decide which shape we are in + inline static Shape<2> GetShape(const Shape<2> &shape, bool transpose) { + return transpose ? 
Shape2(shape[0],shape[1]) : shape; + } + // dst = dot(lhs[.T], rhs[.T]) + template + struct DotEngine { + inline static void Eval(Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale) { + Shape<2> sleft = GetShape(lhs.shape, transpose_left); + Shape<2> sright = GetShape(rhs.shape, transpose_right); + utils::Assert(dst.shape[1] == sleft[1] && dst.shape[0] == sright[0] \ + && sleft[0] == sright[1] , "dot-gemm: matrix shape mismatch"); + // use column major argument to compatible with most BLAS + BLASEngine::gemm + (transpose_right , transpose_left, + transpose_right ? rhs.shape[1] : rhs.shape[0], + transpose_left ? lhs.shape[0] : lhs.shape[1], + transpose_right ? rhs.shape[0] : rhs.shape[1], + scale * SV::kAlphaBLAS, + rhs.dptr, rhs.shape.stride_, + lhs.dptr, lhs.shape.stride_, + SV::kBetaBLAS, + dst.dptr, dst.shape.stride_); + } + }; + template + struct DotEngine { + inline static void Eval(Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale) { + Shape<2> sright = GetShape(rhs.shape, transpose_right); + utils::Assert(dst.shape[0] == sright[0] && lhs.shape[0] == sright[1], "dot-gemv: matrix shape mismatch"); + BLASEngine::gemv + (transpose_right, + rhs.shape[0], rhs.shape[1], scale * SV::kAlphaBLAS, + rhs.dptr, rhs.shape.stride_, + lhs.dptr, 1, SV::kBetaBLAS, + dst.dptr, 1); + } + }; + template + struct DotEngine { + inline static void Eval(Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale) { + utils::Assert(dst.shape[1] == lhs.shape[0] && dst.shape[0] == rhs.shape[0], "dot-ger: matrix shape mismatch"); + if(SV::kBetaBLAS < 1e-6f) { + BLASEngine::ger + (rhs.shape[0], lhs.shape[0], scale * SV::kAlphaBLAS, + rhs.dptr, 1, lhs.dptr, 1, dst.dptr, dst.shape.stride_); + }else{ + DotEngine::Eval(dst, lhs.FlatTo2D(), rhs.FlatTo2D(), scale); + } + } + }; + + }; // namespace expr + + namespace expr{ + /*! \brief some engine that evaluate complex expression */ + template + struct ExpComplexEngine{ + inline static void Eval(Tensor& dst, const E &exp); + }; + template + struct ExpEngine > { + template + inline static void Eval(Tensor& dst, const Exp &exp) { + MapExp(dst, exp); + } + template + inline static void Eval(Tensor& dst, const Exp &exp) { + MapExp(dst, exp); + } + template + inline static void Eval(Tensor& dst, const Exp &exp) { + ExpComplexEngine::Eval(dst, exp.self()); + } + }; + template + struct ExpComplexEngine< SV, Device, dim, DotExp< Tensor, Tensor, ltrans, rtrans > > { + inline static void Eval(Tensor &dst, const DotExp< Tensor, Tensor, ltrans, rtrans > &exp) { + DotEngine::Eval(dst, exp.lhs_, exp.rhs_, exp.scale_); + } + }; + }; // namespace expr +}; +#endif diff --git a/mshadow/expression-inl.h b/mshadow/expression-inl.h new file mode 100644 index 000000000000..b2897ca44faa --- /dev/null +++ b/mshadow/expression-inl.h @@ -0,0 +1,86 @@ +/*! + * \file expression-inl.h + * \brief definitions of operators in expression with respect to scalar + * this file will be included several times, each time with MACRO MSHADOW_SCALAR_ to be different types + * \author Tianqi Chen, Bing Xu + */ +namespace mshadow { +namespace expr { +// DotExp +/*! \brief dot operator def */ +template +inline DotExp +operator*(const DotExp &lhs, MSHADOW_SCALAR_ rhs) { + return DotExp(lhs.lhs_, lhs.rhs_, lhs.scale_ * rhs); +} +/*! \brief scale of dot operation */ +template +inline DotExp +operator*(MSHADOW_SCALAR_ &lhs, const DotExp &rhs) { + return DotExp(rhs.lhs_, rhs.rhs_, rhs.scale_ * lhs); +} + +/*! 
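(editor's aside: how this header is consumed; more stamped overloads follow) */
// ---- editor's note, not part of the patch --------------------------------
// expression.h #defines MSHADOW_SCALAR_ to float, double, ... and includes
// this file once per type, stamping out one concrete overload set per
// scalar. The trick in miniature (our toy, re-inclusion unrolled by hand):
//
//   #define MY_SCALAR_ float
//   inline MY_SCALAR_ twice(MY_SCALAR_ v) { return v + v; }  // twice(float)
//   #undef MY_SCALAR_
//   #define MY_SCALAR_ double
//   inline MY_SCALAR_ twice(MY_SCALAR_ v) { return v + v; }  // twice(double)
//   #undef MY_SCALAR_
//
// Note the scale overload above still takes MSHADOW_SCALAR_& on the left,
// which a literal such as 0.5f cannot bind to; a later patch in this
// series switches it to pass-by-value.
// --------------------------------------------------------------------------
/*!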
\brief operator overload for const */ +template +inline BinaryMapExp, MSHADOW_SCALAR_, (ta|type::kMapper)> +F(const Exp &lhs, MSHADOW_SCALAR_ rhs) { + return MakeExp(lhs, scalar(rhs)); +} +/*! \brief operator overload for const */ +template +inline BinaryMapExp, TB, MSHADOW_SCALAR_, (tb|type::kMapper)> +F(MSHADOW_SCALAR_ lhs, const Exp &rhs) { + return MakeExp(scalar(lhs), rhs); +} +// constant operators +/*! \brief operator overload */ +template +inline BinaryMapExp, MSHADOW_SCALAR_, (ta|type::kMapper)> +operator+(const Exp &lhs, const MSHADOW_SCALAR_ &rhs) { + return MakeExp(lhs, scalar(rhs)); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, MSHADOW_SCALAR_, (ta|type::kMapper)> +operator-(const Exp &lhs, const MSHADOW_SCALAR_ &rhs) { + return MakeExp(lhs, scalar(rhs)); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, MSHADOW_SCALAR_, (ta|type::kMapper)> +operator*(const Exp &lhs, const ScalarExp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, MSHADOW_SCALAR_, (ta|type::kMapper)> +operator/(const Exp &lhs, const MSHADOW_SCALAR_ &rhs) { + return MakeExp(lhs, scalar(rhs)); +} +// constant operators 2 +/*! \brief operator overload */ +template +inline BinaryMapExp, TB, MSHADOW_SCALAR_, (tb|type::kMapper)> +operator+(MSHADOW_SCALAR_ lhs, const Exp &rhs) { + return MakeExp(scalar(lhs), rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, TB, MSHADOW_SCALAR_, (tb|type::kMapper)> +operator-(MSHADOW_SCALAR_ lhs, const Exp &rhs) { + return MakeExp(scalar(lhs), rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, TB, MSHADOW_SCALAR_, (tb|type::kMapper)> +operator*(MSHADOW_SCALAR_ lhs, const Exp &rhs) { + return MakeExp(scalar(lhs), rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, TB, MSHADOW_SCALAR_, (tb|type::kMapper)> +operator/(MSHADOW_SCALAR_ lhs, const Exp &rhs) { + return MakeExp(scalar(lhs), rhs); +} +} // namespace expr +} // namespace mshadow diff --git a/mshadow/expression.h b/mshadow/expression.h index 554997df85b6..7799781e162e 100644 --- a/mshadow/expression.h +++ b/mshadow/expression.h @@ -48,9 +48,10 @@ struct ExpEngine { /*! * \brief base class for expression * \tparam SubType inheritated class must put their type into this parameter + * \tparam DType the data type of each element in the expression * \tparam exp_type expression type, see namespace type */ -template +template struct Exp { public: /*! \return subtype instance of current class */ @@ -67,7 +68,7 @@ struct Exp { * \tparam DType the data type of the scalar */ template -struct ScalarExp: public Exp, type::kMapper> { +struct ScalarExp: public Exp, DType, type::kMapper> { /*! \brief scalar value */ DType scalar_; /*! \brief constructor, must be implicit for implicit conversion */ @@ -75,7 +76,7 @@ struct ScalarExp: public Exp, type::kMapper> { }; /*! \brief create an scalar expression */ template -inline ScalarExp scalar(DType s) { +inline ScalarExp scalar(const DType &s) { return ScalarExp(s); } /*! @@ -85,19 +86,19 @@ inline ScalarExp scalar(DType s) { * \tparam etype the type of expression after cast */ template -struct TypecastExp: public Exp, etype> { +struct TypecastExp: public Exp, DType, etype> { const EType &exp; /*! \brief constructor */ explicit TypecastExp(const EType &e) : exp(e) {} }; /*! 
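(editor's aside: a typing sketch; the tcast/scalar documentation resumes below) */
// ---- editor's example, not part of the patch -----------------------------
// With DType threaded through Exp, mixing element types must be explicit.
// A sketch of scalar<>() above and tcast<>() below (tensor names are ours,
// allocation elsewhere):
//
//   Tensor<cpu, 2, float> dst;
//   Tensor<cpu, 2, int>   cnt;
//   dst = tcast<float>(cnt) * scalar<float>(0.5f);
//
// Without the cast the int and float sub-expressions would not share a
// DType and the expression would fail to type-check.
// --------------------------------------------------------------------------
/*!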
\brief create an scalar expression */ -template -inline TypecastExp tcast(const Exp &exp) { +template +inline TypecastExp tcast(const Exp &exp) { return TypecastExp(exp.self()); } /*! \brief represent a transpose expression of a container */ -template -struct TransposeExp: public Exp, type::kChainer> { +template +struct TransposeExp: public Exp, DType, type::kChainer> { /*! \brief expression to be transposed */ const EType &exp; /*! \brief constructor */ @@ -113,79 +114,76 @@ struct TransposeExp: public Exp, type::kChainer> { * \tparam DataType the element data type of each element in the container */ template -class RValueExp: public Exp { +class RValueExp: public Exp { public: /*! *\brief transpose of a matrix *\return transpose of current expression */ - inline const TransposeExp T(void) const { - return TransposeExp(this->self()); + inline const TransposeExp T(void) const { + return TransposeExp(this->self()); } /*! \brief operator overload */ - inline Container &operator+=(default_real_t s) { - ExpEngine::Eval(this->refself(), scalar(s)); + inline Container &operator+=(DType s) { + ExpEngine::Eval(this->refself(), scalar(s)); return this->refself(); } - /*! \brief operator overload */ - inline Container &operator-=(default_real_t s) { - ExpEngine::Eval(this->refself(), scalar(s)); + inline Container &operator-=(DType s) { + ExpEngine::Eval(this->refself(), scalar(s)); return this->refself(); } - /*! \brief operator overload */ - inline Container &operator*=(default_real_t s) { - ExpEngine::Eval(this->refself(), scalar(s)); + inline Container &operator*=(DType s) { + ExpEngine::Eval(this->refself(), scalar(s)); return this->refself(); } - /*! \brief operator overload */ - inline Container &operator/=(default_real_t s) { - ExpEngine::Eval(this->refself(), scalar(s)); + inline Container &operator/=(DType s) { + ExpEngine::Eval(this->refself(), scalar(s)); return this->refself(); } /*! \brief operator overload */ - inline Container &__assign(default_real_t s) { - ExpEngine::Eval(this->refself(), scalar(s)); + inline Container &__assign(DType s) { + ExpEngine::Eval(this->refself(), scalar(s)); return this->refself(); } /*! \brief implementation of operator=, note that we can not define container = container */ template - inline Container &__assign(const Exp &exp) { + inline Container &__assign(const Exp &exp) { ExpEngine::Eval(this->refself(), exp.self()); return this->refself(); } /*! \brief implementation of operator=, note that we can not define conatiner = container */ template - inline Container &__assign(const Exp &exp) { + inline Container &__assign(const Exp &exp) { ExpEngine::Eval(this->refself(), exp.self()); return this->refself(); } /*! \brief implementation of operator=, note that we can not define container = container */ template - inline Container &__assign(const Exp &exp) { + inline Container &__assign(const Exp &exp) { ExpEngine::Eval(this->refself(), exp.self()); return this->refself(); } /*! \brief implementation of operator+= */ template - inline Container &operator+=(const Exp &exp) { + inline Container &operator+=(const Exp &exp) { ExpEngine::Eval(this->refself(), exp.self()); return this->refself(); } /*! \brief implementation of operator-= */ template - inline Container &operator-=(const Exp &exp) { + inline Container &operator-=(const Exp &exp) { ExpEngine::Eval(this->refself(), exp.self()); return this->refself(); } /*! 
\brief implementation of operator*= */ template - inline Container &operator*=(const Exp &exp) { + inline Container &operator*=(const Exp &exp) { ExpEngine::Eval(this->refself(), exp.self()); return this->refself(); } /*! \brief implementation of operator/= */ template - inline Container &operator/=(const Exp &exp) { + inline Container &operator/=(const Exp &exp) { ExpEngine::Eval(this->refself(), exp.self()); return this->refself(); } @@ -197,54 +195,42 @@ class RValueExp: public Exp { * \tparam ltrans whether lhs is transposed * \tparam rtrans whether rhs is transposed */ -template -struct DotExp: public Exp, type::kComplex> { +template +struct DotExp: public Exp, DType, type::kComplex> { /*! \brief left operand */ const TA &lhs_; /*! \brief right operand */ const TB &rhs_; /*! \brief scale over result */ - default_real_t scale_; + DType scale_; /*! \brief constructor */ - explicit DotExp(const TA &lhs, const TB &rhs, default_real_t scale) + explicit DotExp(const TA &lhs, const TB &rhs, DType scale) : lhs_(lhs), rhs_(rhs), scale_(scale) {} }; // definition of dot expression /*! \brief dot operator def */ template -inline DotExp +inline DotExp dot(const RValueExp &lhs, const RValueExp &rhs) { - return DotExp(lhs.self(), rhs.self(), 1.0f); + return DotExp(lhs.self(), rhs.self(), 1.0f); } /*! \brief dot operator def */ template -inline DotExp -dot(const TransposeExp &lhs, const RValueExp &rhs) { - return DotExp(lhs.exp, rhs.self(), 1.0f); +inline DotExp +dot(const TransposeExp &lhs, const RValueExp &rhs) { + return DotExp(lhs.exp, rhs.self(), 1.0f); } /*! \brief dot operator def */ template -inline DotExp -dot(const RValueExp &lhs, const TransposeExp &rhs) { - return DotExp(lhs.self(), rhs.exp, 1.0f); -} -/*! \brief dot operator def */ -template -inline DotExp -dot(const TransposeExp &lhs, const TransposeExp &rhs) { - return DotExp(lhs.exp, rhs.exp, 1.0f); +inline DotExp +dot(const RValueExp &lhs, const TransposeExp &rhs) { + return DotExp(lhs.self(), rhs.exp, 1.0f); } /*! \brief dot operator def */ -template -inline DotExp -operator*(const DotExp &lhs, default_real_t rhs) { - return DotExp(lhs.lhs_, lhs.rhs_, lhs.scale_ * rhs); -} -/*! \brief scale of dot operation */ -template -inline DotExp -operator*(default_real_t lhs, const DotExp &rhs) { - return DotExp(rhs.lhs_, rhs.rhs_, rhs.scale_ * lhs); +template +inline DotExp +dot(const TransposeExp &lhs, const TransposeExp &rhs) { + return DotExp(lhs.exp, rhs.exp, 1.0f); } //--------------- // BinaryMapExp @@ -256,8 +242,8 @@ operator*(default_real_t lhs, const DotExp &rhs) { * \tparam TB type of rhs * \tparam etype expression type, sa namespace::type */ -template -struct BinaryMapExp: public Exp< BinaryMapExp, etype> { +template +struct BinaryMapExp: public Exp, DType, etype> { /*! \brief left operand */ const TA &lhs_; /*! \brief right operand */ @@ -268,10 +254,10 @@ struct BinaryMapExp: public Exp< BinaryMapExp, etype> { }; /*! \brief make expression */ -template -inline BinaryMapExp -MakeExp(const Exp &lhs, const Exp &rhs) { - return BinaryMapExp(lhs.self(), rhs.self()); +template +inline BinaryMapExp +MakeExp(const Exp &lhs, const Exp &rhs) { + return BinaryMapExp(lhs.self(), rhs.self()); } /*! * \brief short hand for MakeExp, usage F(lhs, rhs). create a binary operation expression @@ -284,96 +270,34 @@ MakeExp(const Exp &lhs, const Exp &rhs) { * \tparam tb rhs expression type * \sa mshadow::op */ -template -inline BinaryMapExp -F(const Exp &lhs, const Exp &rhs) { - return MakeExp(lhs, rhs); -} -/*! 
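(editor's aside: a typed-dot sketch; the removed scalar overloads follow) */
// ---- editor's example, not part of the patch -----------------------------
// The dot() overloads above now carry DType, so double tensors yield a
// DotExp<..., double> and dispatch to the double BLAS path (cblas_dgemm /
// cublasDgemm in the engine shown earlier). Sketch, names ours:
//
//   Tensor<cpu, 2, double> A, B, C;   // allocated elsewhere
//   C = dot(A, B.T());                // one dgemm call
//
/*!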
\brief operator overload for const */ -template -inline BinaryMapExp, (ta|type::kMapper)> -F(const Exp &lhs, const ScalarExp &rhs) { - return MakeExp(lhs, rhs); -} -/*! \brief operator overload for const */ -template -inline BinaryMapExp, TB, (tb|type::kMapper)> -F(const ScalarExp &lhs, const Exp &rhs) { +template +inline BinaryMapExp +F(const Exp &lhs, const Exp &rhs) { return MakeExp(lhs, rhs); } // operator rules /*! \brief operator overload */ -template -inline BinaryMapExp -operator+(const Exp &lhs, const Exp &rhs) { +template +inline BinaryMapExp +operator+(const Exp &lhs, const Exp &rhs) { return MakeExp(lhs, rhs); } /*! \brief operator overload */ -template -inline BinaryMapExp -operator-(const Exp &lhs, const Exp &rhs) { +template +inline BinaryMapExp +operator-(const Exp &lhs, const Exp &rhs) { return MakeExp(lhs, rhs); } /*! \brief operator overload */ -template -inline BinaryMapExp -operator*(const Exp &lhs, const Exp &rhs) { +template +inline BinaryMapExp +operator*(const Exp &lhs, const Exp &rhs) { return MakeExp(lhs, rhs); } /*! \brief operator overload */ -template -inline BinaryMapExp -operator/(const Exp &lhs, const Exp &rhs) { - return MakeExp(lhs, rhs); -} -// constant operators -/*! \brief operator overload */ -template -inline BinaryMapExp, (ta|type::kMapper)> -operator+(const Exp &lhs, const ScalarExp &rhs) { - return MakeExp(lhs, rhs); -} -/*! \brief operator overload */ -template -inline BinaryMapExp, (ta|type::kMapper)> -operator-(const Exp &lhs, const ScalarExp &rhs) { - return MakeExp(lhs, rhs); -} -/*! \brief operator overload */ -template -inline BinaryMapExp, (ta|type::kMapper)> -operator*(const Exp &lhs, const ScalarExp &rhs) { - return MakeExp(lhs, rhs); -} -/*! \brief operator overload */ -template -inline BinaryMapExp, (ta|type::kMapper)> -operator/(const Exp &lhs, const ScalarExp &rhs) { - return MakeExp(lhs, rhs); -} -// constant operators 2 -/*! \brief operator overload */ -template -inline BinaryMapExp, TB, (tb|type::kMapper)> -operator+(const ScalarExp &lhs, const Exp &rhs) { - return MakeExp(lhs, rhs); -} -/*! \brief operator overload */ -template -inline BinaryMapExp, TB, (tb|type::kMapper)> -operator-(const ScalarExp &lhs, const Exp &rhs) { - return MakeExp(lhs, rhs); -} -/*! \brief operator overload */ -template -inline BinaryMapExp, TB, (tb|type::kMapper)> -operator*(const ScalarExp &lhs, const Exp &rhs) { - return MakeExp(lhs, rhs); -} -/*! \brief operator overload */ -template -inline BinaryMapExp, TB, (tb|type::kMapper)> -operator/(const ScalarExp &lhs, const Exp &rhs) { +template +inline BinaryMapExp +operator/(const Exp &lhs, const Exp &rhs) { return MakeExp(lhs, rhs); } //--------------- @@ -385,8 +309,8 @@ operator/(const ScalarExp &lhs, const Exp &rhs) { * \tparam TA type of src * \tparam etype expression type, sa namespace::type */ -template -struct UnaryMapExp: public Exp< UnaryMapExp, etype> { +template +struct UnaryMapExp: public Exp, DType, etype> { /*! \brief source expression */ const TA &src_; /*! \brief constructor */ @@ -394,9 +318,9 @@ struct UnaryMapExp: public Exp< UnaryMapExp, etype> { }; /*! \brief make expression */ -template -inline UnaryMapExp MakeExp(const Exp &src) { - return UnaryMapExp(src.self()); +template +inline UnaryMapExp MakeExp(const Exp &src) { + return UnaryMapExp(src.self()); } /*! 
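(editor's aside: a unary-op sketch; the F(src) documentation resumes below) */
// ---- editor's example, not part of the patch -----------------------------
// Unary counterpart of the binary F<OP> above: any struct with a static
// one-argument Map works, written DType-generic to match the templated
// expressions. Our toy op:
//
//   struct square_ {
//     template<typename DType>
//     MSHADOW_XINLINE static DType Map(DType a) { return a * a; }
//   };
//   dst = F<square_>(src);   // element-wise square in one fused pass
//
/*!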
* \brief short hand for MakeExp, usage F(src), create a unary operation expression @@ -406,12 +330,22 @@ inline UnaryMapExp MakeExp(const Exp &src) { * \tparam ta source expression type * \sa mshadow::op */ -template -inline UnaryMapExp F(const Exp &src) { +template +inline UnaryMapExp F(const Exp &src) { return MakeExp(src); } - - } // namespace expr } // namespace mshadow + + +#ifdef MSAHDOW_SCALAR_ + #error "MSHADOW_SCALAR_ must not be defined" +#endif +// enumerate all the scalar data type we aim to be good at +#define MSHADOW_SCALAR_ float +#include "./expression-inl.h" +#undef MSHADOW_SCALAR_ +#define MSHADOW_SCALAR_ double +#include "./expression-inl.h" +#undef MSHADOW_SCALAR_ #endif // MSHADOW_EXPRESSION_H_ diff --git a/mshadow/tensor.h b/mshadow/tensor.h index 87b31192a502..d54a7c9d4abc 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -282,22 +282,21 @@ struct Tensor: public TRValue, Device, dimensio return Tensor(dptr + s.SubShape().MSize() * begin, s); } /*!\brief functions to fit expression template */ - inline Tensor &operator=(default_real_t s) { - return this->__assign(s); - } - /*!\brief functions to fit expression template */ template - inline Tensor &operator=(const expr::Exp &exp) { + inline Tensor &operator=(const expr::Exp &exp) { return this->__assign(exp); } /*!\brief functions to fit expression template */ template - inline Tensor &operator=(const expr::Exp &exp) { + inline Tensor &operator=(const expr::Exp &exp) { return this->__assign(exp); } /*!\brief functions to fit expression template */ template - inline Tensor &operator=(const expr::Exp &exp) { + inline Tensor &operator=(const expr::Exp &exp) { + return this->__assign(exp); + } + inline Tensor &operator=(const DType &exp) { return this->__assign(exp); } }; @@ -328,20 +327,19 @@ struct Tensor: public expr::RValueExp } MSHADOW_XINLINE DType &operator[](index_t idx) { return dptr[idx]; } MSHADOW_XINLINE const DType &operator[](index_t idx)const { return dptr[idx]; } - // functions to fit expression template - inline Tensor &operator=(double s) { - return this->__assign(s); - } template - inline Tensor &operator=(const expr::Exp &exp) { + inline Tensor &operator=(const expr::Exp &exp) { return this->__assign(exp); } template - inline Tensor &operator=(const expr::Exp &exp) { + inline Tensor &operator=(const expr::Exp &exp) { return this->__assign(exp); } template - inline Tensor &operator=(const expr::Exp &exp) { + inline Tensor &operator=(const expr::Exp &exp) { + return this->__assign(exp); + } + inline Tensor &operator=(const DType &exp) { return this->__assign(exp); } }; @@ -441,10 +439,10 @@ inline void Softmax(Tensor dst, const Tensor &energy); * \sa namespace mshadow:sv, mshadow::op, mshadow::expr */ template -inline void MapExp(TRValue dst, const expr::Exp &exp); +inline void MapExp(TRValue dst, const expr::Exp &exp); /*! \brief refer to comment of cpu ver \sa MapExp */ template -inline void MapExp(TRValue dst, const expr::Exp &exp); +inline void MapExp(TRValue dst, const expr::Exp &exp); /*! * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in lowest dimension (dimension 0) * \tparam Saver specify storage method @@ -459,10 +457,10 @@ inline void MapExp(TRValue dst, const expr::Exp &e * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr */ template -inline void MapReduceKeepLowest(TRValue dst, const expr::Exp &exp, DType scale = 1); +inline void MapReduceKeepLowest(TRValue dst, const expr::Exp &exp, DType scale = 1); /*! 
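(editor's aside: a reduction sketch; the GPU declaration follows) */
// ---- editor's example, not part of the patch -----------------------------
// Sketch of MapReduceKeepLowest declared above: accumulate the square of
// every element of a 2D tensor into a 1D tensor, keeping dimension 0.
// sv::saveto and red::sum are the library's saver/reducer types; square_
// is our toy op and the exact TRValue deduction is assumed.
struct square_ {
  template<typename DType>
  MSHADOW_XINLINE static DType Map(DType a) { return a * a; }
};
inline void ColSumSqSketch(Tensor<cpu, 1, float> dst,
                           Tensor<cpu, 2, float> src) {
  // dst[x] = sum over y of src[y][x] * src[y][x]
  MapReduceKeepLowest<sv::saveto, red::sum>(dst, expr::F<square_>(src), 1.0f);
}
// --------------------------------------------------------------------------
/*!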
\brief refer to comment of cpu ver \sa MapReduceKeepLowest */ template -inline void MapReduceKeepLowest(TRValue dst, const expr::Exp &exp, DType scale = 1); +inline void MapReduceKeepLowest(TRValue dst, const expr::Exp &exp, DType scale = 1); /*! * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in third dimension (dimension 2) * \tparam Saver specify storage method @@ -478,9 +476,9 @@ inline void MapReduceKeepLowest(TRValue dst, const expr::Exp -inline void MapReduceKeepHighDim(TRValue dst, const expr::Exp &exp, DType scale = 1); +inline void MapReduceKeepHighDim(TRValue dst, const expr::Exp &exp, DType scale = 1); /*! \brief refer to comment of cpu ver \sa MapReduceKeepHighDim */ template -inline void MapReduceKeepHighDim(TRValue dst, const expr::Exp &exp, DType scale = 1); +inline void MapReduceKeepHighDim(TRValue dst, const expr::Exp &exp, DType scale = 1); } // namespace mshadow #endif // TENSOR_H diff --git a/mshadow/tensor_expr_engine-inl.hpp b/mshadow/tensor_expr_engine-inl.hpp deleted file mode 100644 index fe72b3e366ed..000000000000 --- a/mshadow/tensor_expr_engine-inl.hpp +++ /dev/null @@ -1,446 +0,0 @@ -#ifndef MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP -#define MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP -/*! - * \file tensor_expr_engine-inl.hpp - * \brief definitions of how expressions should be evaluated - * \author Tianqi Chen, Bing Xu - */ -#include "tensor_expr.h" -#include "tensor.h" - -namespace mshadow{ - namespace expr{ - /*! - * \brief a general class that allows extension that makes tensors of some shape - * \tparam SubType type of subclass - * \tparam SrcExp source expression of the MakeTensorExp, the source of operation - * \tparam dim dimension of the expression - */ - template - struct MakeTensorExp: public Exp< MakeTensorExp, type::kMapper >{ - /*! \brief the shape of this expression */ - Shape shape_; - /*! \brief true self of subtype */ - inline const SubType& real_self( void ) const{ - return *static_cast(this); - } - }; - }; - - namespace expr{ - /*! \brief This part of code gives plan that can be used to carry out execution */ - template - class Plan{ - public: - /*! - * \brief evaluate the expression at index [y][x] - * to be implemented by SubType - */ - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const; - }; - - template - class Plan< Tensor >{ - public: - Plan( const Tensor &t ) - :dptr_(t.dptr),stride_(t.shape.stride_){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ y * stride_ + x ]; - } - private: - const real_t *dptr_; - index_t stride_; - }; - // special evaluation case for 1d tensor - template - class Plan< Tensor >{ - public: - Plan( const Tensor &t ):dptr_(t.dptr){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ x ]; - } - private: - const real_t *dptr_; - }; - - template<> - class Plan{ - public: - Plan( real_t scalar ):scalar_(scalar){} - /*! 
\brief evaluate at [y][x] */ - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return scalar_; - } - private: - real_t scalar_; - }; - - template - class Plan< BinaryMapExp >{ - public: - Plan( const Plan &lhs, const Plan &rhs ) - :lhs_(lhs), rhs_(rhs){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return OP::Map( lhs_.Eval( y, x ), rhs_.Eval( y, x ) ); - } - private: - Plan lhs_; - Plan rhs_; - }; - - template - class Plan< UnaryMapExp >{ - public: - Plan( const Plan &src ):src_(src){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return OP::Map( src_.Eval( y, x ) ); - } - private: - Plan src_; - }; - - - template - struct Plan< MakeTensorExp >{ - public: - Plan( const Plan &src ):src_(src){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return src_.Eval( y, x ); - } - private: - Plan src_; - }; - - template - class Plan< TransposeExp >{ - public: - Plan( const Plan &src ):src_(src){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return src_.Eval( x, y ); - } - private: - Plan src_; - }; - - // allow UnaryMap see the plan - template - inline Plan< BinaryMapExp > MakePlan( const BinaryMapExp &e ); - - // translate from exp to execution plan - inline Plan MakePlan( const ScalarExp &e ){ - return Plan( e.scalar_ ); - } - - template - inline Plan MakePlan( const ContainerExp &e ){ - return Plan( e.self() ); - } - - template - inline Plan > MakePlan( const TransposeExp &e ){ - return Plan >( MakePlan(e.exp) ); - } - - template - inline Plan< T > MakePlan( const MakeTensorExp &e ){ - return Plan< T >( e.real_self() ); - } - - template - inline Plan< UnaryMapExp > MakePlan( const UnaryMapExp &e ){ - return Plan< UnaryMapExp >( MakePlan(e.src_) ); - } - - template - inline Plan< BinaryMapExp > MakePlan( const BinaryMapExp &e ){ - return Plan< BinaryMapExp >( MakePlan(e.lhs_), MakePlan(e.rhs_) ); - } - }; // namespace expr - - namespace expr{ - /*! - * \brief static type inference template, - * used to get the dimension of each expression, - * if ExpInfo::kDim == -1, this means here are mismatch in expression - * if ( ExpInfo::kDevMask & cpu::kDevMask ) != 0, this means this expression can be assigned to cpu - * \tparam E expression - */ - template - struct ExpInfo{ - const static int kDim = -1; - const static int kDevMask = 0; - }; - template<> - struct ExpInfo{ - const static int kDim = 0; - const static int kDevMask = 0xffff; - }; - template - struct ExpInfo >{ - const static int kDim = ExpInfo::kDim; - const static int kDevMask = ExpInfo::kDevMask; - }; - template - struct ExpInfo< Tensor >{ - const static int kDim = dim; - const static int kDevMask = Device::kDevMask; - }; - template - struct ExpInfo< MakeTensorExp >{ - const static int kDimSrc = ExpInfo::kDim; - const static int kDim = kDimSrc >= 0 ? dim : -1; - const static int kDevMask = ExpInfo::kDevMask; - }; - template - struct ExpInfo< UnaryMapExp >{ - const static int kDim = ExpInfo::kDim; - const static int kDevMask = ExpInfo::kDevMask; - }; - template - struct ExpInfo< BinaryMapExp >{ - const static int kDimLhs = ExpInfo::kDim; - const static int kDimRhs = ExpInfo::kDim; - const static int kDim = (kDimLhs>=0 && kDimRhs >= 0) ? \ - ( kDimLhs==0 ? kDimRhs : ( (kDimRhs==0||kDimLhs==kDimRhs) ? kDimLhs : -1 ) ):-1; - const static int kDevMask = ExpInfo::kDevMask & ExpInfo::kDevMask; - }; - - /*! \brief template to do type check */ - template - struct TypeCheck{ - /*! \brief dimension of expression*/ - const static int kExpDim = ExpInfo::kDim; - /*! 
\brief whether the expression device type matches */ - const static bool kDevPass = (ExpInfo::kDevMask & Device::kDevMask) != 0; - /*! \brief whether the expression can be mapped to expression of dim */ - const static bool kMapPass = (kExpDim == 0 || kExpDim == dim) && kDevPass; - /*! \brief whether the expression can be reduced to expression of dim */ - const static bool kRedPass = (kExpDim > dim) && kDevPass; - }; - - template - struct TypeCheckPass; - template<> - struct TypeCheckPass{}; - template<> - struct TypeCheckPass{ - inline static void Error_All_Tensor_in_Exp_Must_Have_Same_Type( void ){} - inline static void Error_TypeCheck_Not_Pass_For_Reduce_Exp( void ){} - inline static void Error_Expression_Does_Not_Meet_Dimension_Req( void ){} - }; - }; // namespace expr - - namespace expr{ - // check shape consistency - template - struct ShapeCheck{ - inline static Shape Check( const E &t ); - }; - - template - struct ShapeCheck{ - inline static Shape Check( const ScalarExp &exp ){ - // use lowest dimension to mark scalar exp - Shape shape; shape[0] = 0; - return shape; - } - }; - template - struct ShapeCheck >{ - inline static Shape Check( const TransposeExp< E > &e ){ - // swap the lowest two dimensions - Shape s = ShapeCheck::Check( e.exp ); - std::swap(s[0], s[1]); - return s; - } - }; - template - struct ShapeCheck >{ - inline static Shape Check( const Tensor &t ){ - return t.shape; - } - }; - template - struct ShapeCheck >{ - inline static Shape Check( const MakeTensorExp &t ){ - return t.shape_; - } - }; - template - struct ShapeCheck< dim,UnaryMapExp >{ - inline static Shape Check( const UnaryMapExp &t ){ - Shape s = ShapeCheck::Check( t.src_ ); - return s; - } - }; - template - struct ShapeCheck< dim, BinaryMapExp >{ - inline static Shape Check( const BinaryMapExp &t ){ - Shape shape1 = ShapeCheck::Check( t.lhs_ ); - Shape shape2 = ShapeCheck::Check( t.rhs_ ); - if( shape1[0] == 0 ) return shape2; - if( shape2[0] == 0 ) return shape1; - utils::Assert( shape1 == shape2, "BinaryMapExp: Shapes of two tensors in BinaryMapExp expression is not the same"); - return shape1; - } - }; - }; // namespace expr - - // the matrix OP depends on BLAS - namespace expr{ - template - struct DotEngine{ - inline static void Eval( Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale ); - }; - - // handles the dot - template - struct BLASEngine; - - #if (MSHADOW_USE_CBLAS||MSHADOW_USE_MKL) - template<> - struct BLASEngine{ - inline static CBLAS_TRANSPOSE GetT( bool t ){ - return t ? 
CblasTrans : CblasNoTrans; - } - inline static void gemm( bool transa, bool transb, int m, int n, int k, float alpha, \ - const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ){ - cblas_sgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemm( bool transa, bool transb, int m, int n, int k, double alpha, \ - const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc ){ - cblas_dgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemv( bool trans, int m, int n, float alpha, const float *A, int lda, \ - const float *X, int incX, float beta, float *Y, int incY ){ - cblas_sgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void gemv( bool trans, int m, int n, double alpha, const double *A, int lda, \ - const double *X, int incX, double beta, double *Y, int incY ){ - cblas_dgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void ger( int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda ){ - cblas_sger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); - } - inline static void ger( int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda ){ - cblas_dger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); - } - }; - #endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL - - #if MSHADOW_USE_CUDA - // All CuBLAS goes to here, use legacy API: not threadsafe - template<> - struct BLASEngine{ - inline static char GetT( bool t ){ - return t ? 'T' : 'N'; - } - inline static void gemm( bool transa, bool transb, int m, int n, int k, float alpha, - const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ){ - cublasSgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemm( bool transa, bool transb, int m, int n, int k, double alpha, - const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc ){ - cublasDgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemv( bool trans, int m, int n, float alpha, const float *A, int lda, \ - const float *X, int incX, float beta, float *Y, int incY ){ - cublasSgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void gemv( bool trans, int m, int n, double alpha, const double *A, int lda, \ - const double *X, int incX, double beta, double *Y, int incY ){ - cublasDgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void ger( int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda ){ - cublasSger(m,n,alpha,X,incX,Y,incY,A,lda); - } - inline static void ger( int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda ){ - cublasDger(m,n,alpha,X,incX,Y,incY,A,lda); - } - }; - #endif - - // helper function to decide which shape we are in - inline static Shape<2> GetShape( const Shape<2> &shape, bool transpose ){ - return transpose ? 
Shape2(shape[0],shape[1]) : shape; - } - // dst = dot( lhs[.T], rhs[.T] ) - template - struct DotEngine{ - inline static void Eval( Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale ) { - Shape<2> sleft = GetShape( lhs.shape, transpose_left ); - Shape<2> sright = GetShape( rhs.shape, transpose_right ); - utils::Assert( dst.shape[1] == sleft[1] && dst.shape[0] == sright[0] \ - && sleft[0] == sright[1] , "dot-gemm: matrix shape mismatch" ); - // use column major argument to compatible with most BLAS - BLASEngine::gemm - ( transpose_right , transpose_left, - transpose_right ? rhs.shape[1] : rhs.shape[0], - transpose_left ? lhs.shape[0] : lhs.shape[1], - transpose_right ? rhs.shape[0] : rhs.shape[1], - scale * SV::kAlphaBLAS, - rhs.dptr, rhs.shape.stride_, - lhs.dptr, lhs.shape.stride_, - SV::kBetaBLAS, - dst.dptr, dst.shape.stride_ ); - } - }; - template - struct DotEngine{ - inline static void Eval( Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale ) { - Shape<2> sright = GetShape( rhs.shape, transpose_right ); - utils::Assert( dst.shape[0] == sright[0] && lhs.shape[0] == sright[1], "dot-gemv: matrix shape mismatch"); - BLASEngine::gemv - ( transpose_right, - rhs.shape[0], rhs.shape[1], scale * SV::kAlphaBLAS, - rhs.dptr, rhs.shape.stride_, - lhs.dptr, 1, SV::kBetaBLAS, - dst.dptr, 1 ); - } - }; - template - struct DotEngine{ - inline static void Eval( Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale ) { - utils::Assert( dst.shape[1] == lhs.shape[0] && dst.shape[0] == rhs.shape[0], "dot-ger: matrix shape mismatch" ); - if( SV::kBetaBLAS < 1e-6f ){ - BLASEngine::ger - ( rhs.shape[0], lhs.shape[0], scale * SV::kAlphaBLAS, - rhs.dptr, 1, lhs.dptr, 1, dst.dptr, dst.shape.stride_ ); - }else{ - DotEngine::Eval( dst, lhs.FlatTo2D(), rhs.FlatTo2D(), scale ); - } - } - }; - - }; // namespace expr - - namespace expr{ - /*! 
\brief some engine that evaluate complex expression */ - template - struct ExpComplexEngine{ - inline static void Eval( Tensor& dst, const E &exp ); - }; - template - struct ExpEngine >{ - template - inline static void Eval( Tensor& dst, const Exp &exp ){ - MapExp( dst, exp ); - } - template - inline static void Eval( Tensor& dst, const Exp &exp ){ - MapExp( dst, exp ); - } - template - inline static void Eval( Tensor& dst, const Exp &exp ){ - ExpComplexEngine::Eval( dst, exp.self() ); - } - }; - template - struct ExpComplexEngine< SV, Device, dim, DotExp< Tensor, Tensor, ltrans, rtrans > >{ - inline static void Eval( Tensor &dst, const DotExp< Tensor, Tensor, ltrans, rtrans > &exp ){ - DotEngine::Eval( dst, exp.lhs_, exp.rhs_, exp.scale_ ); - } - }; - }; // namespace expr -}; -#endif From 465b7dc2a63719ea73c8d2e2011416237dddaa8e Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 24 Dec 2014 17:12:05 -0800 Subject: [PATCH 015/147] change data structure of tensor to add underscore to all memembers --- mshadow/expression-inl.h | 4 +- mshadow/expression.h | 3 ++ mshadow/tensor.h | 95 +++++++++++++++++++++------------------- 3 files changed, 56 insertions(+), 46 deletions(-) diff --git a/mshadow/expression-inl.h b/mshadow/expression-inl.h index b2897ca44faa..d07e3d5419a7 100644 --- a/mshadow/expression-inl.h +++ b/mshadow/expression-inl.h @@ -2,6 +2,8 @@ * \file expression-inl.h * \brief definitions of operators in expression with respect to scalar * this file will be included several times, each time with MACRO MSHADOW_SCALAR_ to be different types + * + * DO NOT add pragma once for macro guard * \author Tianqi Chen, Bing Xu */ namespace mshadow { @@ -16,7 +18,7 @@ operator*(const DotExp &lhs, MSHADOW_SC /*! \brief scale of dot operation */ template inline DotExp -operator*(MSHADOW_SCALAR_ &lhs, const DotExp &rhs) { +operator*(MSHADOW_SCALAR_ lhs, const DotExp &rhs) { return DotExp(rhs.lhs_, rhs.rhs_, rhs.scale_ * lhs); } diff --git a/mshadow/expression.h b/mshadow/expression.h index 7799781e162e..0acb861975e9 100644 --- a/mshadow/expression.h +++ b/mshadow/expression.h @@ -348,4 +348,7 @@ inline UnaryMapExp F(const Exp #define MSHADOW_SCALAR_ double #include "./expression-inl.h" #undef MSHADOW_SCALAR_ +#define MSHADOW_SCALAR_ int +#include "./expression-inl.h" +#undef MSHADOW_SCALAR_ #endif // MSHADOW_EXPRESSION_H_ diff --git a/mshadow/tensor.h b/mshadow/tensor.h index d54a7c9d4abc..b875f81415f9 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -43,11 +43,6 @@ struct Shape { static const int kSubdim = dimension - 1; /*! \brief storing the dimension information */ index_t shape_[kDimension]; - /*! - * \brief storing the stride information in x dimension - * this is used to deal with pitch allocation in gpu or sse(align x dimension to 64bit) for efficiency - */ - index_t stride_; /*! \brief default constructor, do nothing */ MSHADOW_XINLINE Shape(void) {} /*! \brief constuctor */ @@ -56,7 +51,6 @@ struct Shape { for (int i = 0; i < kDimension; ++i) { this->shape_[i] = s[i]; } - this->stride_ = s.stride_; } /*! * \brief get corresponding index @@ -88,7 +82,6 @@ struct Shape { */ MSHADOW_XINLINE Shape<2> FlatTo2D(void) const { Shape<2> s; - s.stride_ = this->stride_; s.shape_[1] = this->shape_[kDimension - 1]; index_t ymax = 1; #pragma unroll @@ -107,15 +100,6 @@ struct Shape { } return size; } - /*! 
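(editor's aside: shape arithmetic after this change; the removed MSize follows) */
// ---- editor's note, not part of the patch --------------------------------
// Shape<> now carries pure extents (stride_ moves onto Tensor in the hunks
// below), so its arithmetic is exact element counts. From the code above:
//
//   Shape<3> s = Shape3(2, 3, 4);
//   s.Size()          // == 24, product of all dimensions
//   s.ProdShape(1, 3) // == 12, product over [1, 3)
//   s.FlatTo2D()      // == Shape2(6, 4), higher dims collapsed
//
/*!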
\return memory size, including the aligned x dimension */
-  MSHADOW_XINLINE size_t MSize(void) const {
-    size_t memsz = this->stride_;
-    #pragma unroll
-    for (int i = 0; i < kDimension - 1; ++i) {
-      memsz *= this->shape_[i];
-    }
-    return memsz;
-  }
   /*!
    * \return product shape in [dimstart,dimend)
    * \param dimstart start dimension
    * \param dimend end dimension
@@ -153,7 +137,7 @@
  * \return subshape
  * \return the shape construction
  */
 MSHADOW_XINLINE Shape<1> Shape1(index_t s0) {
-  Shape<1> s; s[0] = s0; s.stride_ = s0;
+  Shape<1> s; s[0] = s0;
   return s;
 }
 /*!
@@ -163,7 +147,7 @@ MSHADOW_XINLINE Shape<1> Shape1(index_t s0) {
  * \return the shape construction
  */
 MSHADOW_XINLINE Shape<2> Shape2(index_t s0, index_t s1) {
-  Shape<2> s; s[0] = s0; s[1] = s1; s.stride_ = s1;
+  Shape<2> s; s[0] = s0; s[1] = s1;
   return s;
 }
 /*!
@@ -175,7 +159,7 @@ MSHADOW_XINLINE Shape<2> Shape2(index_t s0, index_t s1) {
  */
 MSHADOW_XINLINE Shape<3> Shape3(index_t s0, index_t s1, index_t s2) {
   Shape<3> s;
-  s[0] = s0; s[1] = s1; s[2] = s2; s.stride_ = s2;
+  s[0] = s0; s[1] = s1; s[2] = s2;
   return s;
 }
 /*!
@@ -188,7 +172,7 @@ MSHADOW_XINLINE Shape<3> Shape3(index_t s0, index_t s1, index_t s2) {
  */
 MSHADOW_XINLINE Shape<4> Shape4(index_t s3, index_t s2, index_t s1, index_t s0) {
   Shape<4> s;
-  s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; s.stride_ = s3;
+  s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
   return s;
 }
 /*!
@@ -228,38 +212,59 @@ struct Tensor: public TRValue, Device, dimensio
   // struct members
   //--------------------------------
   /*! \brief pointer to the data */
-  DType *dptr;
+  DType *dptr_;
   /*! \brief shape of the tensor */
-  Shape shape;
+  Shape shape_;
+  /*!
+   * \brief storing the stride information in x dimension
+   * this is used to deal with pitch allocation in gpu or sse(align x dimension to 64bit) for efficiency
+   */
+  index_t stride_;
   /*!
    * \brief stream where the computation lies
    * stream is a device dependency concept where each computation runs
   */
-  Stream *stream;
+  Stream *stream_;
  //--------------------------------
  // functions
  //--------------------------------
  /*! \brief default constructor */
-  MSHADOW_XINLINE Tensor(void) : stream(NULL) {}
+  MSHADOW_XINLINE Tensor(void) {}
  /*! \brief constructor from shape */
-  MSHADOW_XINLINE Tensor(const Shape &shape) : shape(shape), stream(NULL) {}
-  /*! \brief constructor from data pointer and shape */
+  MSHADOW_XINLINE Tensor(const Shape &shape) : shape_(shape) {}
+  /*! \brief constructor from data pointer and shape, without stride */
   MSHADOW_XINLINE Tensor(DType *dptr, const Shape &shape)
-      : dptr(dptr), shape(shape), stream(NULL) {}
+      : dptr_(dptr), shape_(shape), stride_(shape[kSubdim]), stream_(NULL) {}
+  /*! \brief constructor from data pointer and shape */
+  MSHADOW_XINLINE Tensor(DType *dptr, const Shape &shape, index_t stride)
+      : dptr_(dptr), shape_(shape), stride_(stride), stream_(NULL) {}
+  /*!
+   * \return memory cost of the tensor, including the aligned x dimension
+   * \tparam startdim the starting dimension
+   */
+  template
+  MSHADOW_XINLINE size_t MSize(void) const {
+    size_t memsz = this->stride_;
    #pragma unroll
+    for (int i = startdim; i < kSubdim; ++i) {
+      memsz *= this->shape_[i];
+    }
+    return memsz;
+  }
  /*!
   * \brief return size of i-th dimension, start counting from highest dimension
   * \param i the dimension count from the highest dimension
   * \return the size
   */
  MSHADOW_XINLINE index_t size(index_t i) const {
-    return shape[i];
+    return shape_[i];
  }
  /*!
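   * (Editor's note: a short usage sketch of the Shape/Tensor API above
   * follows; it is an editorial addition, not part of the original patch.)
   */
  // --- editor's example (illustrative only, not in the original patch) ---
  // A minimal sketch assuming the CPU-side API declared above; the buffer
  // and sizes are made up:
  //
  //   float data[2 * 3];
  //   Tensor<cpu, 2, float> t(data, Shape2(2, 3));  // stride_ defaults to 3
  //   t[1][2] = 1.0f;                 // operator[] peels off one dimension
  //   Tensor<cpu, 2, float> row = t.Slice(0, 1);    // rows [0, 1)
  //   index_t n = t.shape_.ProdShape(0, 2);         // 6 elements in total
  // -----------------------------------------------------------------------
  /*!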
* \brief flatten the tensor to 2 dimension, collapse the higher dimensions together
   * \return tensor after flatten
   */
  MSHADOW_XINLINE Tensor FlatTo2D(void) const {
-    return Tensor(dptr, shape.FlatTo2D());
+    return Tensor(dptr_, shape_.FlatTo2D(), stride_);
  }
  /*!
   * \brief get a element of dimension - 1
   * \param idx index
   * \return the result tensor
   */
  MSHADOW_XINLINE Tensor operator[](index_t idx) const {
-    Shape s = shape.SubShape();
-    return Tensor(dptr + s.MSize() * idx, s);
+    return Tensor(dptr_ + this->MSize<1>() * idx,
+                  shape_.SubShape(), stride_);
  }
  /*!
   * \brief slice the tensor in highest dimension [begin,end)
   * \param begin begin position of slice
   * \param end end position of slice
   * \return tensor after slice
   */
  MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const {
    Shape s = this->shape_;
    s[0] = end - begin;
-    return Tensor(dptr + s.SubShape().MSize() * begin, s);
+    return Tensor(dptr_ + this->MSize<1>() * begin, s, stride_);
  }
  /*! \brief functions to fit expression template */
  template
template
struct Tensor: public expr::RValueExp, DType> {
 public:
-  DType *dptr;
-  Shape<1> shape;
-  Stream *stream;
+  DType *dptr_;
+  Shape<1> shape_;
+  Stream *stream_;
   // constructor
-  MSHADOW_XINLINE Tensor(void) : stream(NULL) {}
-  MSHADOW_XINLINE Tensor(const Shape<1> &shape): shape(shape), stream(NULL) {}
-  MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape)
-    : dptr(dptr), shape(shape), stream(NULL) {}
+  MSHADOW_XINLINE Tensor(void) {}
+  MSHADOW_XINLINE Tensor(const Shape<1> &shape): shape_(shape) {}
+  MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape)
+    : dptr_(dptr), shape_(shape), stream_(NULL) {}
   MSHADOW_XINLINE Tensor FlatTo2D(void) const {
-    return Tensor(dptr, shape.FlatTo2D());
+    return Tensor(dptr_, shape_.FlatTo2D());
   }
   MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const {
     Shape<1> s;
-    s[0] = s.stride_ = end - begin;
+    s[0] = end - begin;
-    return Tensor(dptr + begin, s);
+    return Tensor(dptr_ + begin, s);
   }
   MSHADOW_XINLINE index_t size(index_t i) const {
-    return shape[0];
+    return shape_[0];
   }
-  MSHADOW_XINLINE DType &operator[](index_t idx) { return dptr[idx]; }
-  MSHADOW_XINLINE const DType &operator[](index_t idx)const { return dptr[idx]; }
+  MSHADOW_XINLINE DType &operator[](index_t idx) { return dptr_[idx]; }
+  MSHADOW_XINLINE const DType &operator[](index_t idx)const { return dptr_[idx]; }
   template
   inline Tensor &operator=(const expr::Exp &exp) {
     return this->__assign(exp);

From 53a2a45b9af12440dc5d1add3202c642185fd7f5 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Wed, 24 Dec 2014 21:22:08 -0800
Subject: [PATCH 016/147] refactor engine

---
 mshadow/base.h            |  14 +-
 mshadow/dot_engine-inl.h  | 148 ++++++++++
 mshadow/expr_engine-inl.h | 550 ++++++++++++++++----------------------
 mshadow/tensor.h          |   9 +-
 4 files changed, 387 insertions(+), 334 deletions(-)
 create mode 100644 mshadow/dot_engine-inl.h

diff --git a/mshadow/base.h b/mshadow/base.h
index 94aadb95c7b2..60061ea0e933 100644
--- a/mshadow/base.h
+++ b/mshadow/base.h
@@ -182,10 +182,10 @@ struct saveto {
   MSHADOW_XINLINE static void Save(DType &a, DType b) {
     a = b;
   }
-  /*! \brief helper constant to use BLAS, alpha */
-  MSHADOW_CONSTEXPR static default_real_t kAlphaBLAS = 1.0f;
+  /*! \brief helper constant to use BLAS, alpha */
+  inline static default_real_t AlphaBLAS(void) { return 1.0f; }
   /*!
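   * (Editor's note, an editorial addition: AlphaBLAS() and BetaBLAS()
   * encode each saver's semantics for BLAS calls of the form
   * C = alpha * op(A) * op(B) + beta * C.)
   */
  // --- editor's example (illustrative only, not in the original patch) ---
  //   C = dot(A, B);    // sv::saveto  -> alpha =  1, beta = 0: overwrite C
  //   C += dot(A, B);   // sv::plusto  -> alpha =  1, beta = 1: accumulate
  //   C -= dot(A, B);   // sv::minusto -> alpha = -1, beta = 1: subtract
  // -----------------------------------------------------------------------
  /*!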
\brief helper constant to use BLAS, beta */ - MSHADOW_CONSTEXPR static default_real_t kBetaBLAS = 0.0f; + inline static default_real_t BetaBLAS(void) { return 0.0f; } /*! \brief corresponding binary operator type */ typedef op::right OPType; }; @@ -197,9 +197,9 @@ struct plusto { a += b; } /*! \brief helper constant to use BLAS, alpha */ - MSHADOW_CONSTEXPR static default_real_t kAlphaBLAS = 1.0f; + inline static default_real_t AlphaBLAS(void) { return 1.0f; } /*! \brief helper constant to use BLAS, beta */ - MSHADOW_CONSTEXPR static default_real_t kBetaBLAS = 1.0f; + inline static default_real_t BetaBLAS(void) { return 1.0f; } /*! \brief corresponding binary operator type */ typedef op::plus OPType; }; @@ -211,9 +211,9 @@ struct minusto { a -= b; } /*! \brief helper constant to use BLAS, alpha */ - MSHADOW_CONSTEXPR static default_real_t kAlphaBLAS = -1.0f; + inline static default_real_t AlphaBLAS(void) { return -1.0f; } /*! \brief helper constant to use BLAS, beta */ - MSHADOW_CONSTEXPR static default_real_t kBetaBLAS = 1.0f; + inline static default_real_t BetaBLAS(void) { return 1.0f; } /*! \brief corresponding binary operator type */ typedef op::minus OPType; }; diff --git a/mshadow/dot_engine-inl.h b/mshadow/dot_engine-inl.h new file mode 100644 index 000000000000..62ba584b9038 --- /dev/null +++ b/mshadow/dot_engine-inl.h @@ -0,0 +1,148 @@ +#ifndef MSHADOW_DOT_ENGINE_INL_H_ +#define MSHADOW_DOT_ENGINE_INL_H_ +/*! + * \file dot_engine-inl.h + * \brief definitions of how Matrix Multiplications can be evaluated + * \author Tianqi Chen + */ +namespace mshadow { +namespace expr { +//--------------------------------------------------------------------- +// Matrix Multiplications, depends on BLAS Engine +//--------------------------------------------------------------------- +template +struct DotEngine { + inline static void Eval(Tensor &dst, + const Tensor &lhs, + const Tensor &rhs, + DType scale); +}; +// handles the dot +template +struct BLASEngine; +#if (MSHADOW_USE_CBLAS||MSHADOW_USE_MKL) +template<> +struct BLASEngine { + inline static CBLAS_TRANSPOSE GetT(bool t) { + return t ? 
CblasTrans : CblasNoTrans; + } + inline static void gemm(bool transa, bool transb, int m, int n, int k, float alpha, \ + const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { + cblas_sgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); + } + inline static void gemm(bool transa, bool transb, int m, int n, int k, double alpha, \ + const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc) { + cblas_dgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); + } + inline static void gemv(bool trans, int m, int n, float alpha, const float *A, int lda, \ + const float *X, int incX, float beta, float *Y, int incY) { + cblas_sgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); + } + inline static void gemv(bool trans, int m, int n, double alpha, const double *A, int lda, \ + const double *X, int incX, double beta, double *Y, int incY) { + cblas_dgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); + } + inline static void ger(int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda) { + cblas_sger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); + } + inline static void ger(int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda) { + cblas_dger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); + } +}; +#endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL +// CuBLAS redirect code +#if MSHADOW_USE_CUDA +// All CuBLAS goes to here, use legacy API: not threadsafe +template<> +struct BLASEngine { + inline static char GetT(bool t) { + return t ? 'T' : 'N'; + } + inline static void gemm(bool transa, bool transb, int m, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { + cublasSgemm(GetT(transa), GetT(transb), m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + } + inline static void gemm(bool transa, bool transb, int m, int n, int k, double alpha, + const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc) { + cublasDgemm(GetT(transa), GetT(transb), m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + } + inline static void gemv(bool trans, int m, int n, float alpha, const float *A, int lda, \ + const float *X, int incX, float beta, float *Y, int incY) { + cublasSgemv(GetT(trans), m, n, alpha, A, lda, X, incX, beta, Y, incY); + } + inline static void gemv(bool trans, int m, int n, double alpha, const double *A, int lda, \ + const double *X, int incX, double beta, double *Y, int incY) { + cublasDgemv(GetT(trans), m, n, alpha, A, lda, X, incX, beta, Y, incY); + } + inline static void ger(int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda) { + cublasSger(m, n, alpha, X, incX, Y, incY, A, lda); + } + inline static void ger(int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda) { + cublasDger(m, n, alpha, X, incX, Y, incY, A, lda); + } +}; +#endif +// helper function to decide which shape we are in +inline static Shape<2> GetShape(const Shape<2> &shape, bool transpose) { + return transpose ? 
Shape2(shape[1], shape[0]) : shape;
}
// dst = dot(lhs[.T], rhs[.T])
template
struct DotEngine {
  inline static void Eval(Tensor &dst,
                          const Tensor &lhs,
                          const Tensor &rhs,
                          DType scale) {
    Shape<2> sleft = GetShape(lhs.shape_, transpose_left);
    Shape<2> sright = GetShape(rhs.shape_, transpose_right);
    utils::Check(dst.size(0) == sleft[0] && dst.size(1) == sright[1] \
                 && sleft[1] == sright[0], "dot-gemm: matrix shape mismatch");
    // use column major argument to be compatible with most BLAS
    BLASEngine::gemm
        (transpose_right, transpose_left,
         transpose_right ? rhs.size(0) : rhs.size(1),
         transpose_left ? lhs.size(1) : lhs.size(0),
         transpose_right ? rhs.size(1) : rhs.size(0),
         scale * SV::AlphaBLAS(),
         rhs.dptr_, rhs.stride_,
         lhs.dptr_, lhs.stride_,
         SV::BetaBLAS(),
         dst.dptr_, dst.stride_);
  }
};
template
struct DotEngine {
  inline static void Eval(Tensor &dst,
                          const Tensor &lhs,
                          const Tensor &rhs,
                          DType scale) {
    Shape<2> sright = GetShape(rhs.shape_, transpose_right);
    utils::Check(dst.size(0) == sright[1] && lhs.size(0) == sright[0],
                "dot-gemv: matrix shape mismatch");
    BLASEngine::gemv
        (transpose_right,
         rhs.size(1), rhs.size(0), scale * SV::AlphaBLAS(),
         rhs.dptr_, rhs.stride_,
         lhs.dptr_, 1, SV::BetaBLAS(),
         dst.dptr_, 1);
  }
};
template
struct DotEngine {
  inline static void Eval(Tensor &dst,
                          const Tensor &lhs,
                          const Tensor &rhs,
                          DType scale) {
    utils::Assert(dst.size(0) == lhs.size(0) && dst.size(1) == rhs.size(0), "dot-ger: matrix shape mismatch");
    if (SV::BetaBLAS() == 0.0f) {
      BLASEngine::ger
          (rhs.size(0), lhs.size(0), scale * SV::AlphaBLAS(),
           rhs.dptr_, 1, lhs.dptr_, 1, dst.dptr_, dst.stride_);
    } else {
      DotEngine::Eval(dst, lhs.FlatTo2D(), rhs.FlatTo2D(), scale);
    }
  }
};
} // namespace expr
} // namespace mshadow
#endif  // MSHADOW_DOT_ENGINE_INL_H_
diff --git a/mshadow/expr_engine-inl.h b/mshadow/expr_engine-inl.h
index 5ddfd53afd39..8cbbdfd5b0b1 100644
--- a/mshadow/expr_engine-inl.h
+++ b/mshadow/expr_engine-inl.h
@@ -1,10 +1,12 @@
 #ifndef MSHADOW_EXPR_ENGINE_INL_H_
 #define MSHADOW_EXPR_ENGINE_INL_H_
 /*!
- * \file texpr_engine-inl.h
+ * \file expr_engine-inl.h
  * \brief definitions of how expressions should be evaluated
  * \author Tianqi Chen, Bing Xu
  */
+#include
+#include "./utils.h"
 #include "./expression.h"
 #include "./tensor.h"

@@ -15,9 +17,11 @@ namespace expr {
 * \tparam SubType type of subclass
 * \tparam SrcExp source expression of the MakeTensorExp, the source of operation
 * \tparam dim dimension of the expression
+ * \tparam DType the type of elements
 */
-template
-struct MakeTensorExp: public Exp< MakeTensorExp, type::kChainer> {
+template
+struct MakeTensorExp
+    : public Exp,
+                 DType, type::kChainer> {
  /*! \brief the shape of this expression */
  Shape shape_;
  /*!
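 * (Editor's note: an illustrative sketch of the dot() path above follows;
 * it is an editorial addition, not part of the original patch.)
 */
// --- editor's example (illustrative only, not in the original patch) ---
// Expressions that reach the matrix-matrix DotEngine above; buffers and
// shapes are made up:
//
//   Tensor<cpu, 2, float> A(a, Shape2(2, 3)), B(b, Shape2(3, 4));
//   Tensor<cpu, 2, float> C(c, Shape2(2, 4));
//   C = dot(A, B);           // sv::saveto: gemm with beta = 0
//   C += dot(A, B) * 0.5f;   // sv::plusto: the scale folds into gemm's alpha
//
// The gemm call passes rhs before lhs because the row-major product
// C = A * B is evaluated as the column-major product C^T = B^T * A^T.
// ------------------------------------------------------------------------
/*!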
\brief true self of subtype */
  const SubType& real_self(void) const{
    return *static_cast(this);
  }
};
//----------------------------------------------------------------------
// This part of code gives plan that can be used to carry out execution
//---------------------------------------------------------------------
// Declarations of plans
template
class Plan {
 public:
  /*!
   * \brief evaluate the expression at index [y][x]
   * to be implemented by SubType, for RValue, the return type will be DType &
   */
  MSHADOW_XINLINE DType Eval(index_t y, index_t x) const;
};
// Plan for Tensor
template
class Plan, DType> {
 public:
  Plan(const Tensor &t)
      : dptr_(t.dptr_), stride_(t.stride_) {}
  // for RValue, the return type should be reference
  MSHADOW_XINLINE DType &Eval(index_t y, index_t x) {
    return dptr_[y * stride_ + x];
  }
  // const evaluation
  MSHADOW_XINLINE const DType &Eval(index_t y, index_t x) const {
    return dptr_[y * stride_ + x];
  }

 private:
  DType *dptr_;
  index_t stride_;
};
// special evaluation case for 1d tensor, no stride
template
class Plan, DType> {
 public:
  Plan(const Tensor &t) : dptr_(t.dptr_) {}
  MSHADOW_XINLINE DType &Eval(index_t y, index_t x) {
    return dptr_[x];
  }
  MSHADOW_XINLINE const DType &Eval(index_t y, index_t x) const {
    return dptr_[x];
  }

 private:
  DType *dptr_;
};
// scalar
template
class Plan, DType> {
 public:
  Plan(DType scalar) : scalar_(scalar) {}
  MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
    return scalar_;
  }

 private:
  DType scalar_;
};
// binary expression
template
class Plan, DType> {
 public:
  Plan(const Plan &lhs, const Plan &rhs)
      : lhs_(lhs), rhs_(rhs) {}
  MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
    return OP::Map(lhs_.Eval(y, x), rhs_.Eval(y, x));
  }

 private:
  Plan lhs_;
  Plan rhs_;
};
// unary expression
template
class Plan, DType> {
 public:
  Plan(const Plan &src) : src_(src) {}
  MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
    return OP::Map(src_.Eval(y, x));
  }

 private:
  Plan src_;
};
// remaps MakeTensorExp to the subtype's plan
template
struct Plan< MakeTensorExp > {
 public:
  Plan(const Plan &src):src_(src) {}
  MSHADOW_XINLINE real_t Eval(index_t y, index_t x) const {
    return src_.Eval(y, x);
  }

 private:
  Plan src_;
};
// transpose
template
class Plan, DType> {
 public:
  Plan(const Plan &src) : src_(src) {}
  MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
    return src_.Eval(x, y);
  }

 private:
  Plan src_;
};
//----------------------------------------------------------------------
// Mappings from expression to plans
//---------------------------------------------------------------------
template
inline Plan, DType> MakePlan(const BinaryMapExp &e);

template
inline Plan MakePlan(const ScalarExp &e) {
  return Plan(e.scalar_);
}

template
inline Plan MakePlan(const RValueExp &e) {
  return Plan(e.self());
}

template
inline Plan, DType> MakePlan(const TransposeExp &e) {
  return Plan >(MakePlan(e.exp));
}

template
inline Plan< T > MakePlan(const MakeTensorExp &e) {
  return Plan< T >(e.real_self());
}

template
inline Plan< UnaryMapExp > MakePlan(const UnaryMapExp &e) {
  return Plan< UnaryMapExp >(MakePlan(e.src_));
}

template
inline Plan< BinaryMapExp > MakePlan(const BinaryMapExp &e) {
  return Plan< BinaryMapExp >(MakePlan(e.lhs_), MakePlan(e.rhs_));
}
}; // namespace expr

 namespace expr{
 /*!
- * \brief static type inference template, - * used to get the dimension of each expression, - * if ExpInfo::kDim == -1, this means here are mismatch in expression - * if (ExpInfo::kDevMask & cpu::kDevMask) != 0, this means this expression can be assigned to cpu - * \tparam E expression - */ - template - struct ExpInfo{ - const static int kDim = -1; - const static int kDevMask = 0; - }; - template<> - struct ExpInfo { - const static int kDim = 0; - const static int kDevMask = 0xffff; - }; - template - struct ExpInfo > { - const static int kDim = ExpInfo::kDim; - const static int kDevMask = ExpInfo::kDevMask; - }; - template - struct ExpInfo< Tensor > { - const static int kDim = dim; - const static int kDevMask = Device::kDevMask; - }; - template - struct ExpInfo< MakeTensorExp > { - const static int kDimSrc = ExpInfo::kDim; - const static int kDim = kDimSrc >= 0 ? dim : -1; - const static int kDevMask = ExpInfo::kDevMask; - }; - template - struct ExpInfo< UnaryMapExp > { - const static int kDim = ExpInfo::kDim; - const static int kDevMask = ExpInfo::kDevMask; - }; - template - struct ExpInfo< BinaryMapExp > { - const static int kDimLhs = ExpInfo::kDim; - const static int kDimRhs = ExpInfo::kDim; - const static int kDim = (kDimLhs>=0 && kDimRhs >= 0) ? \ - (kDimLhs==0 ? kDimRhs : ((kDimRhs==0||kDimLhs==kDimRhs) ? kDimLhs : -1)):-1; - const static int kDevMask = ExpInfo::kDevMask & ExpInfo::kDevMask; - }; - - /*! \brief template to do type check */ - template - struct TypeCheck{ - /*! \brief dimension of expression*/ - const static int kExpDim = ExpInfo::kDim; - /*! \brief whether the expression device type matches */ - const static bool kDevPass = (ExpInfo::kDevMask & Device::kDevMask) != 0; - /*! \brief whether the expression can be mapped to expression of dim */ - const static bool kMapPass = (kExpDim == 0 || kExpDim == dim) && kDevPass; - /*! 
\brief whether the expression can be reduced to expression of dim */ - const static bool kRedPass = (kExpDim > dim) && kDevPass; - }; - - template - struct TypeCheckPass; - template<> - struct TypeCheckPass {}; - template<> - struct TypeCheckPass { - inline static void Error_All_Tensor_in_Exp_Must_Have_Same_Type(void) {} - inline static void Error_TypeCheck_Not_Pass_For_Reduce_Exp(void) {} - inline static void Error_Expression_Does_Not_Meet_Dimension_Req(void) {} - }; - }; // namespace expr - - namespace expr{ - // check shape consistency - template - struct ShapeCheck{ - inline static Shape Check(const E &t); - }; - - template - struct ShapeCheck { - inline static Shape Check(const ScalarExp &exp) { - // use lowest dimension to mark scalar exp - Shape shape; shape[0] = 0; - return shape; - } - }; - template - struct ShapeCheck > { - inline static Shape Check(const TransposeExp< E > &e) { - // swap the lowest two dimensions - Shape s = ShapeCheck::Check(e.exp); - std::swap(s[0], s[1]); - return s; - } - }; - template - struct ShapeCheck > { - inline static Shape Check(const Tensor &t) { - return t.shape; - } - }; - template - struct ShapeCheck > { - inline static Shape Check(const MakeTensorExp &t) { - return t.shape_; - } - }; - template - struct ShapeCheck< dim,UnaryMapExp > { - inline static Shape Check(const UnaryMapExp &t) { - Shape s = ShapeCheck::Check(t.src_); - return s; - } - }; - template - struct ShapeCheck< dim, BinaryMapExp > { - inline static Shape Check(const BinaryMapExp &t) { - Shape shape1 = ShapeCheck::Check(t.lhs_); - Shape shape2 = ShapeCheck::Check(t.rhs_); - if(shape1[0] == 0) return shape2; - if(shape2[0] == 0) return shape1; - utils::Assert(shape1 == shape2, "BinaryMapExp: Shapes of two tensors in BinaryMapExp expression is not the same"); - return shape1; - } - }; - }; // namespace expr - - // the matrix OP depends on BLAS - namespace expr{ - template - struct DotEngine{ - inline static void Eval(Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale); - }; - - // handles the dot - template - struct BLASEngine; - - #if (MSHADOW_USE_CBLAS||MSHADOW_USE_MKL) - template<> - struct BLASEngine { - inline static CBLAS_TRANSPOSE GetT(bool t) { - return t ? 
CblasTrans : CblasNoTrans; - } - inline static void gemm(bool transa, bool transb, int m, int n, int k, float alpha, \ - const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { - cblas_sgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemm(bool transa, bool transb, int m, int n, int k, double alpha, \ - const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc) { - cblas_dgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemv(bool trans, int m, int n, float alpha, const float *A, int lda, \ - const float *X, int incX, float beta, float *Y, int incY) { - cblas_sgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void gemv(bool trans, int m, int n, double alpha, const double *A, int lda, \ - const double *X, int incX, double beta, double *Y, int incY) { - cblas_dgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void ger(int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda) { - cblas_sger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); - } - inline static void ger(int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda) { - cblas_dger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); - } - }; - #endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL - - #if MSHADOW_USE_CUDA - // All CuBLAS goes to here, use legacy API: not threadsafe - template<> - struct BLASEngine { - inline static char GetT(bool t) { - return t ? 'T' : 'N'; - } - inline static void gemm(bool transa, bool transb, int m, int n, int k, float alpha, - const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { - cublasSgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemm(bool transa, bool transb, int m, int n, int k, double alpha, - const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc) { - cublasDgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemv(bool trans, int m, int n, float alpha, const float *A, int lda, \ - const float *X, int incX, float beta, float *Y, int incY) { - cublasSgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void gemv(bool trans, int m, int n, double alpha, const double *A, int lda, \ - const double *X, int incX, double beta, double *Y, int incY) { - cublasDgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void ger(int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda) { - cublasSger(m,n,alpha,X,incX,Y,incY,A,lda); - } - inline static void ger(int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda) { - cublasDger(m,n,alpha,X,incX,Y,incY,A,lda); - } - }; - #endif - - // helper function to decide which shape we are in - inline static Shape<2> GetShape(const Shape<2> &shape, bool transpose) { - return transpose ? 
Shape2(shape[0],shape[1]) : shape; - } - // dst = dot(lhs[.T], rhs[.T]) - template - struct DotEngine { - inline static void Eval(Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale) { - Shape<2> sleft = GetShape(lhs.shape, transpose_left); - Shape<2> sright = GetShape(rhs.shape, transpose_right); - utils::Assert(dst.shape[1] == sleft[1] && dst.shape[0] == sright[0] \ - && sleft[0] == sright[1] , "dot-gemm: matrix shape mismatch"); - // use column major argument to compatible with most BLAS - BLASEngine::gemm - (transpose_right , transpose_left, - transpose_right ? rhs.shape[1] : rhs.shape[0], - transpose_left ? lhs.shape[0] : lhs.shape[1], - transpose_right ? rhs.shape[0] : rhs.shape[1], - scale * SV::kAlphaBLAS, - rhs.dptr, rhs.shape.stride_, - lhs.dptr, lhs.shape.stride_, - SV::kBetaBLAS, - dst.dptr, dst.shape.stride_); - } - }; - template - struct DotEngine { - inline static void Eval(Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale) { - Shape<2> sright = GetShape(rhs.shape, transpose_right); - utils::Assert(dst.shape[0] == sright[0] && lhs.shape[0] == sright[1], "dot-gemv: matrix shape mismatch"); - BLASEngine::gemv - (transpose_right, - rhs.shape[0], rhs.shape[1], scale * SV::kAlphaBLAS, - rhs.dptr, rhs.shape.stride_, - lhs.dptr, 1, SV::kBetaBLAS, - dst.dptr, 1); - } - }; - template - struct DotEngine { - inline static void Eval(Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale) { - utils::Assert(dst.shape[1] == lhs.shape[0] && dst.shape[0] == rhs.shape[0], "dot-ger: matrix shape mismatch"); - if(SV::kBetaBLAS < 1e-6f) { - BLASEngine::ger - (rhs.shape[0], lhs.shape[0], scale * SV::kAlphaBLAS, - rhs.dptr, 1, lhs.dptr, 1, dst.dptr, dst.shape.stride_); - }else{ - DotEngine::Eval(dst, lhs.FlatTo2D(), rhs.FlatTo2D(), scale); - } - } - }; - - }; // namespace expr +//---------------------------------------------------------------- +// Static Type inference and Type Checking +//---------------------------------------------------------------- +/*! + * \brief static type inference template, + * used to get the dimension of each expression, + * if ExpInfo::kDim == -1, this means here are mismatch in expression + * if (ExpInfo::kDevMask & cpu::kDevMask) != 0, this means this expression can be assigned to cpu + * \tparam E expression + */ +template +struct ExpInfo { + const static int kDim = -1; + const static int kDevMask = 0; + const static int kTypeMask = 0; +}; +template +struct ExpInfo< ScalarExp > { + const static int kDim = 0; + const static int kDevMask = 0xffff; +}; +template +struct ExpInfo > { + const static int kDim = ExpInfo::kDim; + const static int kDevMask = ExpInfo::kDevMask; +}; +template +struct ExpInfo > { + const static int kDim = dim; + const static int kDevMask = Device::kDevMask; +}; +template +struct ExpInfo > { + const static int kDimSrc = ExpInfo::kDim; + const static int kDim = kDimSrc >= 0 ? dim : -1; + const static int kDevMask = ExpInfo::kDevMask; +}; +template +struct ExpInfo > { + const static int kDim = ExpInfo::kDim; + const static int kDevMask = ExpInfo::kDevMask; +}; +template +struct ExpInfo > { + const static int kDimLhs = ExpInfo::kDim; + const static int kDimRhs = ExpInfo::kDim; + const static int kDim = (kDimLhs>=0 && kDimRhs >= 0) ? \ + (kDimLhs==0 ? kDimRhs : ((kDimRhs==0||kDimLhs==kDimRhs) ? kDimLhs : -1)):-1; + const static int kDevMask = ExpInfo::kDevMask & ExpInfo::kDevMask; +}; +/*! \brief template to do type check */ +template +struct TypeCheck { + /*! 
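   * (Editor's note: an illustrative instantiation of the traits above
   * follows; it is an editorial addition, not part of the original patch.)
   */
  // --- editor's example (illustrative only, not in the original patch) ---
  // What the static inference above computes for a concrete expression:
  //
  //   For a cpu-tensor-plus-cpu-tensor BinaryMapExp of two 2D operands,
  //   ExpInfo yields kDim == 2 and kDevMask == cpu::kDevMask.
  //   Mixing a cpu and a gpu operand yields kDevMask == 0, so kDevPass
  //   below is false and TypeCheckPass<false> stops compilation.
  // -----------------------------------------------------------------------
  /*!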
\brief dimension of expression*/ + const static int kExpDim = ExpInfo::kDim; + /*! \brief whether the expression device type matches */ + const static bool kDevPass = (ExpInfo::kDevMask & Device::kDevMask) != 0; + /*! \brief whether the expression can be mapped to expression of dim */ + const static bool kMapPass = (kExpDim == 0 || kExpDim == dim) && kDevPass; + /*! \brief whether the expression can be reduced to expression of dim */ + const static bool kRedPass = (kExpDim > dim) && kDevPass; +}; +/*! \brief used to help static type check*/ +template +struct TypeCheckPass; +// Todo : add static assert using C++11 +template<> +struct TypeCheckPass {}; +template<> +struct TypeCheckPass { + inline static void Error_All_Tensor_in_Exp_Must_Have_Same_Type(void) {} + inline static void Error_TypeCheck_Not_Pass_For_Reduce_Exp(void) {} + inline static void Error_Expression_Does_Not_Meet_Dimension_Req(void) {} +}; +//---------------------------------------------------------------- +// Runtime Shape Checking +//---------------------------------------------------------------- +/*! + * \brief runtime shape checking template + * get the shape of an expression, report error if shape mismatch + * \tparam dim the dimension of the shape + * \tparam E expression + */ +template +struct ShapeCheck{ + inline static Shape Check(const E &t); +}; +template +struct ShapeCheck > { + inline static Shape Check(const ScalarExp &exp) { + // use lowest dimension to mark scalar exp + Shape shape; shape[0] = 0; + return shape; + } +}; +template +struct ShapeCheck > { + inline static Shape Check(const TransposeExp &e) { + // swap the lowest two dimensions + Shape s = ShapeCheck::Check(e.exp); + std::swap(s[0], s[1]); + return s; + } +}; +template +struct ShapeCheck > { + inline static Shape Check(const Tensor &t) { + return t.shape_; + } +}; +template +struct ShapeCheck > { + inline static Shape + Check(const MakeTensorExp &t) { + return t.shape_; + } +}; +template +struct ShapeCheck > { + inline static Shape Check(const UnaryMapExp &t) { + Shape s = ShapeCheck::Check(t.src_); + return s; + } +}; +template +struct ShapeCheck > { + inline static Shape Check(const BinaryMapExp &t) { + Shape shape1 = ShapeCheck::Check(t.lhs_); + Shape shape2 = ShapeCheck::Check(t.rhs_); + if (shape1[0] == 0) return shape2; + if (shape2[0] == 0) return shape1; + utils::Check(shape1 == shape2, + "BinaryMapExp: Shapes of two operands in BinaryMapExp are not the same"); + return shape1; + } +}; +} // namespace expr +} // namespace mshadow +// include definition of dot engine +#include "./dot_engine-inl.h" - namespace expr{ - /*! \brief some engine that evaluate complex expression */ - template - struct ExpComplexEngine{ - inline static void Eval(Tensor& dst, const E &exp); - }; - template - struct ExpEngine > { - template - inline static void Eval(Tensor& dst, const Exp &exp) { - MapExp(dst, exp); - } - template - inline static void Eval(Tensor& dst, const Exp &exp) { - MapExp(dst, exp); - } - template - inline static void Eval(Tensor& dst, const Exp &exp) { - ExpComplexEngine::Eval(dst, exp.self()); - } - }; - template - struct ExpComplexEngine< SV, Device, dim, DotExp< Tensor, Tensor, ltrans, rtrans > > { - inline static void Eval(Tensor &dst, const DotExp< Tensor, Tensor, ltrans, rtrans > &exp) { - DotEngine::Eval(dst, exp.lhs_, exp.rhs_, exp.scale_); - } - }; - }; // namespace expr +namespace mshadow { +namespace expr { +/*! 
\brief some engine that evaluate complex expression */ +template +struct ExpComplexEngine { + inline static void Eval(Tensor& dst, const E &exp); +}; +/*! \brief the engine that dispatches simple operations*/ +template +struct ExpEngine > { + template + inline static void Eval(Tensor& dst, const Exp &exp) { + MapExp(dst, exp); + } + template + inline static void Eval(Tensor& dst, const Exp &exp) { + MapExp(dst, exp); + } + template + inline static void Eval(Tensor& dst, const Exp &exp) { + MapExp(dst, exp); + } + template + inline static void Eval(Tensor& dst, const Exp &exp) { + ExpComplexEngine::Eval(dst, exp.self()); + } +}; +template +struct ExpComplexEngine, Tensor, ltrans, rtrans, DType>, + DType> { + inline static void Eval(Tensor &dst, + const DotExp, + Tensor, + ltrans, rtrans, DType> &exp) { + DotEngine::Eval(dst, exp.lhs_, exp.rhs_, exp.scale_); + } }; -#endif +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXPR_ENGINE_INL_H_ diff --git a/mshadow/tensor.h b/mshadow/tensor.h index b875f81415f9..ad40037712ab 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -312,15 +312,16 @@ template struct Tensor: public expr::RValueExp, DType> { public: DType *dptr_; - Shape<1> shape_; + Shape<1> shape_; + index_t stride_; Stream *stream_; // constructor MSHADOW_XINLINE Tensor(void) {} MSHADOW_XINLINE Tensor(const Shape<1> &shape): shape_(shape) {} MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape) - : dptr_(dptr), shape_(shape), stream_(NULL) {} + : dptr_(dptr), shape_(shape), stride_(shape[0]), stream_(NULL) {} MSHADOW_XINLINE Tensor FlatTo2D(void) const { - return Tensor(dptr_, shape_.FlatTo2D(), shape_[0]); + return Tensor(dptr_, shape_.FlatTo2D(), stride_); } MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const { Shape<1> s; @@ -486,4 +487,6 @@ inline void MapReduceKeepHighDim(TRValue dst, const expr::Exp< template inline void MapReduceKeepHighDim(TRValue dst, const expr::Exp &exp, DType scale = 1); } // namespace mshadow + +#include "./expr_engine-inl.h" #endif // TENSOR_H From 529b095ad3644edca50dfdae7f982b51aaaf57c2 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 24 Dec 2014 22:11:37 -0800 Subject: [PATCH 017/147] cpplint --- mshadow/base.h | 19 +++-- mshadow/dot_engine-inl.h | 142 ++++++++++++++++++++++++-------------- mshadow/expr_engine-inl.h | 124 ++++++++++++++++++--------------- mshadow/expression-inl.h | 66 ++++++++++++------ mshadow/expression.h | 105 ++++++++++++++++------------ mshadow/tensor.h | 125 ++++++++++++++++++++++----------- mshadow/utils.h | 8 ++- 7 files changed, 365 insertions(+), 224 deletions(-) diff --git a/mshadow/base.h b/mshadow/base.h index 60061ea0e933..71f7a0b8382b 100644 --- a/mshadow/base.h +++ b/mshadow/base.h @@ -1,18 +1,22 @@ -#ifndef MSHADOW_BASE_H_ -#define MSHADOW_BASE_H_ /*! + * Copyright (c) 2014 by Contributors * \file base.h * \brief definitions of base types, operators, macros functions * * \author Bing Xu, Tianqi Chen */ +#ifndef MSHADOW_BASE_H_ +#define MSHADOW_BASE_H_ #include #include #include #include #include // macro defintiions -/*!\brief if this macro is define to be 1, mshadow should compile without any of other libs */ +/*! 
+ * \brief if this macro is define to be 1, + * mshadow should compile without any of other libs + */ #ifndef MSHADOW_STAND_ALONE #define MSHADOW_STAND_ALONE 1 #endif @@ -30,7 +34,7 @@ #endif #if MSHADOW_STAND_ALONE - #define MSHADOW_USE_CBLAS 0 + #define MSHADOW_USE_CBLAS 1 #define MSHADOW_USE_MKL 0 #define MSHADOW_USE_CUDA 0 #endif @@ -88,7 +92,7 @@ extern "C" { #include #endif // -------------------------------- -// MSHADOW_XINLINE is used for inlining template code for both CUDA and CPU code. +// MSHADOW_XINLINE is used for inlining template code for both CUDA and CPU code #ifdef MSHADOW_XINLINE #error "MSHADOW_XINLINE must not be defined" #endif @@ -162,7 +166,8 @@ struct right { } }; // unary operator/ function: example -// these operators can be defined by user, in the same style as binary and unary operator +// these operators can be defined by user, +// in the same style as binary and unary operator // to use, simply write F( src ) /*! \brief identity function that maps a real number to it self */ struct identity{ @@ -182,7 +187,7 @@ struct saveto { MSHADOW_XINLINE static void Save(DType &a, DType b) { a = b; } - /*! \brief helper constant to use BLAS, alpha */ + /*! \brief helper constant to use BLAS, alpha */ inline static default_real_t AlphaBLAS(void) { return 1.0f; } /*! \brief helper constant to use BLAS, beta */ inline static default_real_t BetaBLAS(void) { return 0.0f; } diff --git a/mshadow/dot_engine-inl.h b/mshadow/dot_engine-inl.h index 62ba584b9038..2d0f9e240627 100644 --- a/mshadow/dot_engine-inl.h +++ b/mshadow/dot_engine-inl.h @@ -1,18 +1,20 @@ -#ifndef MSHADOW_DOT_ENGINE_INL_H_ -#define MSHADOW_DOT_ENGINE_INL_H_ /*! + * Copyright (c) 2014 by Contributors * \file dot_engine-inl.h * \brief definitions of how Matrix Multiplications can be evaluated * \author Tianqi Chen */ +#ifndef MSHADOW_DOT_ENGINE_INL_H_ +#define MSHADOW_DOT_ENGINE_INL_H_ namespace mshadow { namespace expr { //--------------------------------------------------------------------- // Matrix Multiplications, depends on BLAS Engine //--------------------------------------------------------------------- -template +template struct DotEngine { - inline static void Eval(Tensor &dst, + inline static void Eval(Tensor *p_dst, const Tensor &lhs, const Tensor &rhs, DType scale); @@ -20,36 +22,52 @@ struct DotEngine { // handles the dot template struct BLASEngine; -#if (MSHADOW_USE_CBLAS||MSHADOW_USE_MKL) +#if (MSHADOW_USE_CBLAS || MSHADOW_USE_MKL) template<> struct BLASEngine { inline static CBLAS_TRANSPOSE GetT(bool t) { return t ? 
CblasTrans : CblasNoTrans; } - inline static void gemm(bool transa, bool transb, int m, int n, int k, float alpha, \ - const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { - cblas_sgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemm(bool transa, bool transb, int m, int n, int k, double alpha, \ - const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc) { - cblas_dgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemv(bool trans, int m, int n, float alpha, const float *A, int lda, \ - const float *X, int incX, float beta, float *Y, int incY) { - cblas_sgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void gemv(bool trans, int m, int n, double alpha, const double *A, int lda, \ - const double *X, int incX, double beta, double *Y, int incY) { - cblas_dgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void ger(int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda) { - cblas_sger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); - } - inline static void ger(int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda) { - cblas_dger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); + inline static void gemm(bool transa, bool transb, + int m, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, + float beta, float *C, int ldc) { + cblas_sgemm(CblasColMajor, GetT(transa), GetT(transb), + m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + } + inline static void gemm(bool transa, bool transb, + int m, int n, int k, double alpha, + const double *A, int lda, const double *B, int ldb, + double beta, double *C, int ldc) { + cblas_dgemm(CblasColMajor, GetT(transa), GetT(transb), + m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + } + inline static void gemv(bool trans, int m, int n, + float alpha, const float *A, int lda, + const float *X, int incX, + float beta, float *Y, int incY) { + cblas_sgemv(CblasColMajor, GetT(trans), m, n, alpha, + A, lda, X, incX, beta, Y, incY); + } + inline static void gemv(bool trans, int m, int n, double alpha, + const double *A, int lda, + const double *X, int incX, + double beta, double *Y, int incY) { + cblas_dgemv(CblasColMajor, GetT(trans), m, n, alpha, + A, lda, X, incX, beta, Y, incY); + } + inline static void ger(int m, int n, float alpha, + const float *X, int incX, + const float *Y, int incY, float *A, int lda) { + cblas_sger(CblasColMajor, m, n, alpha, X, incX, Y, incY, A, lda); + } + inline static void ger(int m, int n, double alpha, + const double *X, int incX, + const double *Y, int incY, double *A, int lda) { + cblas_dger(CblasColMajor, m, n, alpha, X, incX, Y, incY, A, lda); } }; -#endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL +#endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL // CuBLAS redirect code #if MSHADOW_USE_CUDA // All CuBLAS goes to here, use legacy API: not threadsafe @@ -58,43 +76,61 @@ struct BLASEngine { inline static char GetT(bool t) { return t ? 
'T' : 'N'; } - inline static void gemm(bool transa, bool transb, int m, int n, int k, float alpha, - const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { - cublasSgemm(GetT(transa), GetT(transb), m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); - } - inline static void gemm(bool transa, bool transb, int m, int n, int k, double alpha, - const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc) { - cublasDgemm(GetT(transa), GetT(transb), m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); - } - inline static void gemv(bool trans, int m, int n, float alpha, const float *A, int lda, \ - const float *X, int incX, float beta, float *Y, int incY) { + inline static void gemm(bool transa, bool transb, + int m, int n, int k, float alpha, + const float *A, int lda, + const float *B, int ldb, float beta, + float *C, int ldc) { + cublasSgemm(GetT(transa), GetT(transb), m, n, k, alpha, + A, lda, B, ldb, beta, C, ldc); + } + inline static void gemm(bool transa, bool transb, + int m, int n, int k, double alpha, + const double *A, int lda, + const double *B, int ldb, + double beta, double *C, int ldc) { + cublasDgemm(GetT(transa), GetT(transb), m, n, k, alpha, + A, lda, B, ldb, beta, C, ldc); + } + inline static void gemv(bool trans, int m, int n, float alpha, + const float *A, int lda, + const float *X, int incX, float beta, + float *Y, int incY) { cublasSgemv(GetT(trans), m, n, alpha, A, lda, X, incX, beta, Y, incY); } - inline static void gemv(bool trans, int m, int n, double alpha, const double *A, int lda, \ - const double *X, int incX, double beta, double *Y, int incY) { + inline static void gemv(bool trans, int m, int n, double alpha, + const double *A, int lda, + const double *X, int incX, + double beta, double *Y, int incY) { cublasDgemv(GetT(trans), m, n, alpha, A, lda, X, incX, beta, Y, incY); } - inline static void ger(int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda) { + inline static void ger(int m, int n, float alpha, + const float *X, int incX, + const float *Y, int incY, float *A, int lda) { cublasSger(m, n, alpha, X, incX, Y, incY, A, lda); } - inline static void ger(int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda) { + inline static void ger(int m, int n, double alpha, + const double *X, int incX, + const double *Y, int incY, double *A, int lda) { cublasDger(m, n, alpha, X, incX, Y, incY, A, lda); } }; #endif -// helper function to decide which shape we are in +// helper function to decide which shape we are in inline static Shape<2> GetShape(const Shape<2> &shape, bool transpose) { return transpose ? 
Shape2(shape[1], shape[0]) : shape;
}
// dst = dot(lhs[.T], rhs[.T])
template
struct DotEngine {
  inline static void Eval(Tensor *p_dst,
                          const Tensor &lhs,
                          const Tensor &rhs,
                          DType scale) {
    Tensor &dst = *p_dst;
    Shape<2> sleft = GetShape(lhs.shape_, transpose_left);
    Shape<2> sright = GetShape(rhs.shape_, transpose_right);
    utils::Check(dst.size(0) == sleft[0] && dst.size(1) == sright[1] \
                 && sleft[1] == sright[0], "dot-gemm: matrix shape mismatch");
    // use column major argument to be compatible with most BLAS
    BLASEngine::gemm
        (transpose_right, transpose_left,
         transpose_right ? rhs.size(0) : rhs.size(1),
         transpose_left ? lhs.size(1) : lhs.size(0),
         transpose_right ? rhs.size(1) : rhs.size(0),
         scale * SV::AlphaBLAS(),
         rhs.dptr_, rhs.stride_,
         lhs.dptr_, lhs.stride_,
         SV::BetaBLAS(),
         dst.dptr_, dst.stride_);
  }
};
template
struct DotEngine {
  inline static void Eval(Tensor *p_dst,
                          const Tensor &lhs,
                          const Tensor &rhs,
                          DType scale) {
    Tensor &dst = *p_dst;
    Shape<2> sright = GetShape(rhs.shape_, transpose_right);
    utils::Check(dst.size(0) == sright[1] && lhs.size(0) == sright[0],
                 "dot-gemv: matrix shape mismatch");
    BLASEngine::gemv
        (transpose_right,
         rhs.size(1), rhs.size(0), scale * SV::AlphaBLAS(),
         rhs.dptr_, rhs.stride_,
         lhs.dptr_, 1, SV::BetaBLAS(),
         dst.dptr_, 1);
  }
};
template
struct DotEngine {
  inline static void Eval(Tensor *p_dst,
                          const Tensor &lhs,
                          const Tensor &rhs,
                          DType scale) {
    Tensor &dst = *p_dst;
    utils::Check(dst.size(0) == lhs.size(0) && dst.size(1) == rhs.size(0),
                 "dot-ger: matrix shape mismatch");
    if (SV::BetaBLAS() == 0.0f) {
      BLASEngine::ger
          (rhs.size(0), lhs.size(0), scale * SV::AlphaBLAS(),
           rhs.dptr_, 1, lhs.dptr_, 1, dst.dptr_, dst.stride_);
    } else {
      DotEngine::Eval(p_dst, lhs.FlatTo2D(), rhs.FlatTo2D(), scale);
    }
  }
};
} // namespace expr
} // namespace mshadow
#endif  // MSHADOW_DOT_ENGINE_INL_H_
diff --git a/mshadow/expr_engine-inl.h b/mshadow/expr_engine-inl.h
index 8cbbdfd5b0b1..5448b6774b22 100644
--- a/mshadow/expr_engine-inl.h
+++ b/mshadow/expr_engine-inl.h
@@ -1,11 +1,13 @@
-#ifndef MSHADOW_EXPR_ENGINE_INL_H_
-#define MSHADOW_EXPR_ENGINE_INL_H_
 /*!
+ * Copyright (c) 2014 by Contributors
  * \file expr_engine-inl.h
  * \brief definitions of how expressions should be evaluated
  * \author Tianqi Chen, Bing Xu
  */
+#ifndef MSHADOW_EXPR_ENGINE_INL_H_
+#define MSHADOW_EXPR_ENGINE_INL_H_
 #include
+#include
 #include "./utils.h"
 #include "./expression.h"
 #include "./tensor.h"
@@ -21,7 +23,8 @@ namespace expr {
  */
 template
 struct MakeTensorExp
-    : public Exp, DType, type::kChainer> {
+    : public Exp,
+             DType, type::kChainer> {
   /*! \brief the shape of this expression */
   Shape shape_;
   /*!
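 * (Editor's note: a sketch of the vector specializations above follows;
 * it is an editorial addition, not part of the original patch.)
 */
// --- editor's example (illustrative only, not in the original patch) ---
// Expressions that reach the gemv specialization; buffers and shapes
// are made up:
//
//   Tensor<cpu, 1, float> x(bx, Shape1(3)), y(by, Shape1(4));
//   Tensor<cpu, 2, float> M(bm, Shape2(3, 4));
//   y = dot(x, M);    // 1D = dot(1D, 2D): dispatched to gemv
//   y += dot(x, M);   // same call, with beta = 1 via sv::plusto
// ------------------------------------------------------------------------
/*!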
\brief true self of subtype */
  const SubType& real_self(void) const{
    return *static_cast(this);
  }
};
//----------------------------------------------------------------------
// This part of code gives plan that can be used to carry out execution
//---------------------------------------------------------------------
// Declarations of plans
template
class Plan {
 public:
  /*!
   * \brief evaluate the expression at index [y][x]
   * to be implemented by SubType, for RValue, the return type will be DType &
   */
  MSHADOW_XINLINE DType Eval(index_t y, index_t x) const;
};
// Plan for Tensor
template
class Plan, DType> {
 public:
  explicit Plan(const Tensor &t)
      : dptr_(t.dptr_), stride_(t.stride_) {}
  // for RValue, the return type should be reference
  MSHADOW_XINLINE DType &Eval(index_t y, index_t x) {
    return dptr_[y * stride_ + x];
  }
  // const evaluation
  MSHADOW_XINLINE const DType &Eval(index_t y, index_t x) const {
    return dptr_[y * stride_ + x];
  }

 private:
  DType *dptr_;
  index_t stride_;
};
// special evaluation case for 1d tensor, no stride
template
class Plan, DType> {
 public:
  explicit Plan(const Tensor &t) : dptr_(t.dptr_) {}
  MSHADOW_XINLINE DType &Eval(index_t y, index_t x) {
    return dptr_[x];
  }
  MSHADOW_XINLINE const DType &Eval(index_t y, index_t x) const {
    return dptr_[x];
  }

 private:
  DType *dptr_;
};
// scalar
template
class Plan, DType> {
 public:
  explicit Plan(DType scalar) : scalar_(scalar) {}
  MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
    return scalar_;
  }

 private:
  DType scalar_;
};
// binary expression
template
class Plan, DType> {
 public:
  explicit Plan(const Plan &lhs, const Plan &rhs)
      : lhs_(lhs), rhs_(rhs) {}
  MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
    return OP::Map(lhs_.Eval(y, x), rhs_.Eval(y, x));
  }

 private:
  Plan lhs_;
  Plan rhs_;
};
// unary expression
template
class Plan, DType> {
 public:
  explicit Plan(const Plan &src) : src_(src) {}
  MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
    return OP::Map(src_.Eval(y, x));
  }

 private:
  Plan src_;
};
// remaps MakeTensorExp to the subtype's plan
template
struct Plan, DType> {
 public:
  explicit Plan(const Plan &src) : src_(src) {}
  MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
    return src_.Eval(y, x);
  }

 private:
  Plan src_;
};
// transpose
template
class Plan, DType> {
 public:
  explicit Plan(const Plan &src) : src_(src) {}
  MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
    return src_.Eval(x, y);
  }

 private:
  Plan src_;
};
//----------------------------------------------------------------------
// Mappings from expression to plans
//---------------------------------------------------------------------
template
inline Plan, DType>
MakePlan(const BinaryMapExp &e);

template
inline Plan, DType> MakePlan(const ScalarExp &e) {
  return Plan, DType>(e.scalar_);
}

template
inline Plan MakePlan(const RValueExp &e) {
  return Plan(e.self());
}

template
inline Plan, DType>
MakePlan(const TransposeExp &e) {
  return Plan, DType>(MakePlan(e.exp));
}

template
inline Plan
MakePlan(const MakeTensorExp &e) {
  return Plan(e.real_self());
}

template
inline Plan, DType>
MakePlan(const UnaryMapExp &e) {
  return Plan, DType>(MakePlan(e.src_));
}

template
inline Plan, DType>
MakePlan(const BinaryMapExp &e) {
  return Plan,
              DType>(MakePlan(e.lhs_), MakePlan(e.rhs_));
}
//----------------------------------------------------------------
// Static Type inference and Type Checking
//----------------------------------------------------------------
/*!
 * \brief static type inference template,
 *        used to get the dimension of each expression,
 *        if ExpInfo::kDim == -1, this means there is a mismatch in the expression
 *        if (ExpInfo::kDevMask & cpu::kDevMask) != 0, this means this expression can be assigned to cpu
 * \tparam E expression
 */
template
struct ExpInfo {
  static const int kDim = -1;
  static const int kDevMask = 0;
  static const int kTypeMask = 0;
};
template
struct ExpInfo< ScalarExp > {
  static const int kDim = 0;
  static const int kDevMask = 0xffff;
};
template
struct ExpInfo > {
  static const int kDim = ExpInfo::kDim;
  static const int kDevMask = ExpInfo::kDevMask;
};
template struct ExpInfo > { - const static int kDim = dim; - const static int kDevMask = Device::kDevMask; + static const int kDim = dim; + static const int kDevMask = Device::kDevMask; }; template struct ExpInfo > { - const static int kDimSrc = ExpInfo::kDim; - const static int kDim = kDimSrc >= 0 ? dim : -1; - const static int kDevMask = ExpInfo::kDevMask; + static const int kDimSrc = ExpInfo::kDim; + static const int kDim = kDimSrc >= 0 ? dim : -1; + static const int kDevMask = ExpInfo::kDevMask; }; template struct ExpInfo > { - const static int kDim = ExpInfo::kDim; - const static int kDevMask = ExpInfo::kDevMask; + static const int kDim = ExpInfo::kDim; + static const int kDevMask = ExpInfo::kDevMask; }; template struct ExpInfo > { - const static int kDimLhs = ExpInfo::kDim; - const static int kDimRhs = ExpInfo::kDim; - const static int kDim = (kDimLhs>=0 && kDimRhs >= 0) ? \ - (kDimLhs==0 ? kDimRhs : ((kDimRhs==0||kDimLhs==kDimRhs) ? kDimLhs : -1)):-1; - const static int kDevMask = ExpInfo::kDevMask & ExpInfo::kDevMask; + static const int kDimLhs = ExpInfo::kDim; + static const int kDimRhs = ExpInfo::kDim; + static const int kDim = (kDimLhs >= 0 && kDimRhs >= 0) ?\ + (kDimLhs == 0 ?\ + kDimRhs :\ + ((kDimRhs == 0 || kDimLhs == kDimRhs) ? kDimLhs : -1)) : -1; + static const int kDevMask = ExpInfo::kDevMask & ExpInfo::kDevMask; }; /*! \brief template to do type check */ template struct TypeCheck { /*! \brief dimension of expression*/ - const static int kExpDim = ExpInfo::kDim; + static const int kExpDim = ExpInfo::kDim; /*! \brief whether the expression device type matches */ - const static bool kDevPass = (ExpInfo::kDevMask & Device::kDevMask) != 0; + static const bool kDevPass = (ExpInfo::kDevMask & Device::kDevMask) != 0; /*! \brief whether the expression can be mapped to expression of dim */ - const static bool kMapPass = (kExpDim == 0 || kExpDim == dim) && kDevPass; + static const bool kMapPass = (kExpDim == 0 || kExpDim == dim) && kDevPass; /*! \brief whether the expression can be reduced to expression of dim */ - const static bool kRedPass = (kExpDim > dim) && kDevPass; + static const bool kRedPass = (kExpDim > dim) && kDevPass; }; /*! \brief used to help static type check*/ template @@ -277,7 +283,7 @@ template struct ShapeCheck > { inline static Shape Check(const TransposeExp &e) { // swap the lowest two dimensions - Shape s = ShapeCheck::Check(e.exp); + Shape s = ShapeCheck::Check(e.exp); std::swap(s[0], s[1]); return s; } @@ -302,15 +308,17 @@ struct ShapeCheck > { return s; } }; -template +template struct ShapeCheck > { - inline static Shape Check(const BinaryMapExp &t) { + inline static Shape + Check(const BinaryMapExp &t) { Shape shape1 = ShapeCheck::Check(t.lhs_); Shape shape2 = ShapeCheck::Check(t.rhs_); if (shape1[0] == 0) return shape2; if (shape2[0] == 0) return shape1; utils::Check(shape1 == shape2, - "BinaryMapExp: Shapes of two operands in BinaryMapExp are not the same"); + "BinaryMapExp: Shapes of operands are not the same"); return shape1; } }; @@ -324,37 +332,45 @@ namespace expr { /*! \brief some engine that evaluate complex expression */ template struct ExpComplexEngine { - inline static void Eval(Tensor& dst, const E &exp); + inline static void Eval(Tensor *dst, const E &exp); }; /*! 
\brief the engine that dispatches simple operations*/ template struct ExpEngine > { template - inline static void Eval(Tensor& dst, const Exp &exp) { - MapExp(dst, exp); + inline static void Eval(Tensor *dst, + const Exp &exp) { + MapExp(*dst, exp); } template - inline static void Eval(Tensor& dst, const Exp &exp) { - MapExp(dst, exp); + inline static void Eval(Tensor *dst, + const Exp &exp) { + MapExp(*dst, exp); } template - inline static void Eval(Tensor& dst, const Exp &exp) { - MapExp(dst, exp); + inline static void Eval(Tensor *dst, + const Exp &exp) { + MapExp(*dst, exp); } template - inline static void Eval(Tensor& dst, const Exp &exp) { + inline static void Eval(Tensor *dst, + const Exp &exp) { ExpComplexEngine::Eval(dst, exp.self()); } }; -template +template struct ExpComplexEngine, Tensor, ltrans, rtrans, DType>, + DotExp, + Tensor, + ltrans, rtrans, DType>, DType> { - inline static void Eval(Tensor &dst, + inline static void Eval(Tensor *dst, const DotExp, Tensor, ltrans, rtrans, DType> &exp) { - DotEngine::Eval(dst, exp.lhs_, exp.rhs_, exp.scale_); + DotEngine::Eval(dst, exp.lhs_, exp.rhs_, exp.scale_); } }; } // namespace expr diff --git a/mshadow/expression-inl.h b/mshadow/expression-inl.h index d07e3d5419a7..283bc96d2676 100644 --- a/mshadow/expression-inl.h +++ b/mshadow/expression-inl.h @@ -1,9 +1,10 @@ /*! + * Copyright (c) 2014 by Contributors * \file expression-inl.h * \brief definitions of operators in expression with respect to scalar * this file will be included several times, each time with MACRO MSHADOW_SCALAR_ to be different types * - * DO NOT add pragma once for macro guard + * DO NOT add pragma once or macro guard * \author Tianqi Chen, Bing Xu */ namespace mshadow { @@ -12,75 +13,96 @@ namespace expr { /*! \brief dot operator def */ template inline DotExp -operator*(const DotExp &lhs, MSHADOW_SCALAR_ rhs) { - return DotExp(lhs.lhs_, lhs.rhs_, lhs.scale_ * rhs); +operator*(const DotExp &lhs, + MSHADOW_SCALAR_ rhs) { + return DotExp(lhs.lhs_, lhs.rhs_, lhs.scale_ * rhs); } /*! \brief scale of dot operation */ template inline DotExp -operator*(MSHADOW_SCALAR_ lhs, const DotExp &rhs) { - return DotExp(rhs.lhs_, rhs.rhs_, rhs.scale_ * lhs); +operator*(MSHADOW_SCALAR_ lhs, + const DotExp &rhs) { + return DotExp(rhs.lhs_, rhs.rhs_, rhs.scale_ * lhs); } /*! \brief operator overload for const */ template -inline BinaryMapExp, MSHADOW_SCALAR_, (ta|type::kMapper)> +inline BinaryMapExp, + MSHADOW_SCALAR_, (ta|type::kMapper)> F(const Exp &lhs, MSHADOW_SCALAR_ rhs) { return MakeExp(lhs, scalar(rhs)); } /*! \brief operator overload for const */ template -inline BinaryMapExp, TB, MSHADOW_SCALAR_, (tb|type::kMapper)> +inline BinaryMapExp, TB, + MSHADOW_SCALAR_, (tb|type::kMapper)> F(MSHADOW_SCALAR_ lhs, const Exp &rhs) { return MakeExp(scalar(lhs), rhs); } // constant operators /*! \brief operator overload */ template -inline BinaryMapExp, MSHADOW_SCALAR_, (ta|type::kMapper)> -operator+(const Exp &lhs, const MSHADOW_SCALAR_ &rhs) { +inline BinaryMapExp, + MSHADOW_SCALAR_, (ta|type::kMapper)> +operator+(const Exp &lhs, + const MSHADOW_SCALAR_ &rhs) { return MakeExp(lhs, scalar(rhs)); } /*! \brief operator overload */ template -inline BinaryMapExp, MSHADOW_SCALAR_, (ta|type::kMapper)> -operator-(const Exp &lhs, const MSHADOW_SCALAR_ &rhs) { +inline BinaryMapExp, + MSHADOW_SCALAR_, (ta|type::kMapper)> +operator-(const Exp &lhs, + const MSHADOW_SCALAR_ &rhs) { return MakeExp(lhs, scalar(rhs)); } /*! 
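 * (Editor's note: an illustrative use of the scalar overloads in this file
 * follows; it is an editorial addition, not part of the original patch.)
 */
// --- editor's example (illustrative only, not in the original patch) ---
// With MSHADOW_SCALAR_ instantiated for float, double and (new in patch
// 015) int, scalars combine freely with tensor expressions:
//
//   Tensor<cpu, 2, float> w(buf, Shape2(3, 3));
//   w = w * 0.9f + 0.1f;   // BinaryMapExp chains with ScalarExp operands
//   w = 1.0f - w;          // scalar-on-the-left overloads
// ------------------------------------------------------------------------
/*!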
\brief operator overload */ template -inline BinaryMapExp, MSHADOW_SCALAR_, (ta|type::kMapper)> -operator*(const Exp &lhs, const ScalarExp &rhs) { +inline BinaryMapExp, + MSHADOW_SCALAR_, (ta|type::kMapper)> +operator*(const Exp &lhs, + const ScalarExp &rhs) { return MakeExp(lhs, rhs); } /*! \brief operator overload */ template -inline BinaryMapExp, MSHADOW_SCALAR_, (ta|type::kMapper)> -operator/(const Exp &lhs, const MSHADOW_SCALAR_ &rhs) { +inline BinaryMapExp, + MSHADOW_SCALAR_, (ta|type::kMapper)> +operator/(const Exp &lhs, + const MSHADOW_SCALAR_ &rhs) { return MakeExp(lhs, scalar(rhs)); } // constant operators 2 /*! \brief operator overload */ template -inline BinaryMapExp, TB, MSHADOW_SCALAR_, (tb|type::kMapper)> -operator+(MSHADOW_SCALAR_ lhs, const Exp &rhs) { +inline BinaryMapExp, TB, + MSHADOW_SCALAR_, (tb|type::kMapper)> +operator+(MSHADOW_SCALAR_ lhs, + const Exp &rhs) { return MakeExp(scalar(lhs), rhs); } /*! \brief operator overload */ template -inline BinaryMapExp, TB, MSHADOW_SCALAR_, (tb|type::kMapper)> -operator-(MSHADOW_SCALAR_ lhs, const Exp &rhs) { +inline BinaryMapExp, TB, + MSHADOW_SCALAR_, (tb|type::kMapper)> +operator-(MSHADOW_SCALAR_ lhs, + const Exp &rhs) { return MakeExp(scalar(lhs), rhs); } /*! \brief operator overload */ template -inline BinaryMapExp, TB, MSHADOW_SCALAR_, (tb|type::kMapper)> -operator*(MSHADOW_SCALAR_ lhs, const Exp &rhs) { +inline BinaryMapExp, TB, + MSHADOW_SCALAR_, (tb|type::kMapper)> +operator*(MSHADOW_SCALAR_ lhs, + const Exp &rhs) { return MakeExp(scalar(lhs), rhs); } /*! \brief operator overload */ template -inline BinaryMapExp, TB, MSHADOW_SCALAR_, (tb|type::kMapper)> +inline BinaryMapExp, TB, + MSHADOW_SCALAR_, (tb|type::kMapper)> operator/(MSHADOW_SCALAR_ lhs, const Exp &rhs) { return MakeExp(scalar(lhs), rhs); } diff --git a/mshadow/expression.h b/mshadow/expression.h index 0acb861975e9..902b5d002017 100644 --- a/mshadow/expression.h +++ b/mshadow/expression.h @@ -1,10 +1,11 @@ -#ifndef MSHADOW_EXPRESSION_H_ -#define MSHADOW_EXPRESSION_H_ /*! + * Copyright (c) 2014 by Contributors * \file expression.h * \brief definitions of abstract expressions and expressions template * \author Tianqi Chen, Bing Xu */ +#ifndef MSHADOW_EXPRESSION_H_ +#define MSHADOW_EXPRESSION_H_ #include "./base.h" namespace mshadow { @@ -19,9 +20,15 @@ namespace expr { namespace type { // type expression type are defined as bitmask // subtype relationshop kRValue < kMapper < kPull < kComplex -/*! \brief this expression directly correspnds to a data class, can be used to assign data */ +/*! + * \brief this expression directly correspnds to a data class, + * can be used to assign data + */ const int kRValue = 0; -/*! \brief expression contains element-wise tensor operations, map a expression to same shape */ +/*! + * \brief expression contains element-wise tensor operations, + * map a expression to same shape + */ const int kMapper = 1; /*! * \brief expression that can be chained with other expressiones @@ -43,7 +50,7 @@ template struct ExpEngine { /*! \brief defines how expression exp can be evaluated and stored into dst */ template - inline static void Eval(RValue& dst, const EType &exp); + inline static void Eval(RValue *dst, const EType &exp); }; /*! * \brief base class for expression @@ -59,8 +66,8 @@ struct Exp { return *static_cast(this); } /*! \return reference of subtype instance of current class */ - inline SubType& refself(void) { - return *static_cast(this); + inline SubType* ptrself(void) { + return static_cast(this); } }; /*! 
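
// A standalone CRTP sketch of the self()/ptrself() members introduced above
// (the names ExpBase and MyExp are illustrative, not mshadow API): the base
// class recovers the concrete subtype at compile time, so expression code
// needs no virtual dispatch, and ptrself() hands out a mutable pointer for
// the pointer-style Eval interface.
#include <cstdio>

template<typename SubType>
struct ExpBase {
  const SubType &self(void) const {
    return *static_cast<const SubType*>(this);
  }
  SubType *ptrself(void) {
    return static_cast<SubType*>(this);
  }
};
struct MyExp : public ExpBase<MyExp> {
  int value;
};

int main(void) {
  MyExp e; e.value = 42;
  printf("%d\n", e.self().value);  // read through the base, no vtable
  e.ptrself()->value = 7;          // mutate through the base
  printf("%d\n", e.value);
  return 0;
}
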
@@ -71,8 +78,8 @@ template struct ScalarExp: public Exp, DType, type::kMapper> { /*! \brief scalar value */ DType scalar_; - /*! \brief constructor, must be implicit for implicit conversion */ - ScalarExp(DType scalar) : scalar_(scalar) {} + /*! \brief constructor */ + explicit ScalarExp(DType scalar) : scalar_(scalar) {} }; /*! \brief create an scalar expression */ template @@ -89,16 +96,18 @@ template struct TypecastExp: public Exp, DType, etype> { const EType &exp; /*! \brief constructor */ - explicit TypecastExp(const EType &e) : exp(e) {} + explicit TypecastExp(const EType &e) : exp(e) {} }; /*! \brief create an scalar expression */ template -inline TypecastExp tcast(const Exp &exp) { +inline TypecastExp +tcast(const Exp &exp) { return TypecastExp(exp.self()); } /*! \brief represent a transpose expression of a container */ template -struct TransposeExp: public Exp, DType, type::kChainer> { +struct TransposeExp: public Exp, + DType, type::kChainer> { /*! \brief expression to be transposed */ const EType &exp; /*! \brief constructor */ @@ -125,67 +134,67 @@ class RValueExp: public Exp { } /*! \brief operator overload */ inline Container &operator+=(DType s) { - ExpEngine::Eval(this->refself(), scalar(s)); - return this->refself(); + ExpEngine::Eval(this->ptrself(), scalar(s)); + return *(this->ptrself()); } inline Container &operator-=(DType s) { - ExpEngine::Eval(this->refself(), scalar(s)); - return this->refself(); + ExpEngine::Eval(this->ptrself(), scalar(s)); + return *(this->ptrself()); } inline Container &operator*=(DType s) { - ExpEngine::Eval(this->refself(), scalar(s)); - return this->refself(); + ExpEngine::Eval(this->ptrself(), scalar(s)); + return *(this->ptrself()); } inline Container &operator/=(DType s) { - ExpEngine::Eval(this->refself(), scalar(s)); - return this->refself(); + ExpEngine::Eval(this->ptrself(), scalar(s)); + return *(this->ptrself()); } /*! \brief operator overload */ inline Container &__assign(DType s) { - ExpEngine::Eval(this->refself(), scalar(s)); - return this->refself(); + ExpEngine::Eval(this->ptrself(), scalar(s)); + return *(this->ptrself()); } - /*! \brief implementation of operator=, note that we can not define container = container */ + /*! \brief we can not define container = container */ template inline Container &__assign(const Exp &exp) { - ExpEngine::Eval(this->refself(), exp.self()); - return this->refself(); + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); } - /*! \brief implementation of operator=, note that we can not define conatiner = container */ + /*! \brief we can not define conatiner = container */ template inline Container &__assign(const Exp &exp) { - ExpEngine::Eval(this->refself(), exp.self()); - return this->refself(); + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); } - /*! \brief implementation of operator=, note that we can not define container = container */ + /*! \brief we can not define container = container */ template inline Container &__assign(const Exp &exp) { - ExpEngine::Eval(this->refself(), exp.self()); - return this->refself(); + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); } /*! \brief implementation of operator+= */ template inline Container &operator+=(const Exp &exp) { - ExpEngine::Eval(this->refself(), exp.self()); - return this->refself(); + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); } /*! 
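*/

// Usage sketch of tcast above, written in the two-type (destination/source)
// form that later hunks in this series settle on; the revised basic.cpp
// below exercises the same pattern with tcast<float>(mat.T()).
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void TypecastDemo(void) {
  Tensor<cpu, 2, double> src = NewTensor<cpu, double>(Shape2(2, 3), 1.5);
  Tensor<cpu, 2, float> dst = NewTensor<cpu, float>(Shape2(2, 3), 0.0f);
  dst = tcast<float>(src) + 0.5f;  // elementwise static_cast, then add
  FreeSpace(&src); FreeSpace(&dst);
}

/*!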
\brief implementation of operator-= */ template inline Container &operator-=(const Exp &exp) { - ExpEngine::Eval(this->refself(), exp.self()); - return this->refself(); + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); } /*! \brief implementation of operator*= */ template inline Container &operator*=(const Exp &exp) { - ExpEngine::Eval(this->refself(), exp.self()); - return this->refself(); + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); } /*! \brief implementation of operator/= */ template inline Container &operator/=(const Exp &exp) { - ExpEngine::Eval(this->refself(), exp.self()); - return this->refself(); + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); } }; /*! @@ -196,7 +205,8 @@ class RValueExp: public Exp { * \tparam rtrans whether rhs is transposed */ template -struct DotExp: public Exp, DType, type::kComplex> { +struct DotExp: public Exp, + DType, type::kComplex> { /*! \brief left operand */ const TA &lhs_; /*! \brief right operand */ @@ -243,7 +253,8 @@ dot(const TransposeExp &lhs, const TransposeExp &rhs) { * \tparam etype expression type, sa namespace::type */ template -struct BinaryMapExp: public Exp, DType, etype> { +struct BinaryMapExp: public Exp, + DType, etype> { /*! \brief left operand */ const TA &lhs_; /*! \brief right operand */ @@ -257,7 +268,8 @@ struct BinaryMapExp: public Exp, DType, e template inline BinaryMapExp MakeExp(const Exp &lhs, const Exp &rhs) { - return BinaryMapExp(lhs.self(), rhs.self()); + return BinaryMapExp(lhs.self(), rhs.self()); } /*! * \brief short hand for MakeExp, usage F(lhs, rhs). create a binary operation expression @@ -310,7 +322,8 @@ operator/(const Exp &lhs, const Exp &rhs) { * \tparam etype expression type, sa namespace::type */ template -struct UnaryMapExp: public Exp, DType, etype> { +struct UnaryMapExp: public Exp, + DType, etype> { /*! \brief source expression */ const TA &src_; /*! \brief constructor */ @@ -319,7 +332,8 @@ struct UnaryMapExp: public Exp, DType, etype> /*! \brief make expression */ template -inline UnaryMapExp MakeExp(const Exp &src) { +inline UnaryMapExp +MakeExp(const Exp &src) { return UnaryMapExp(src.self()); } /*! @@ -331,7 +345,8 @@ inline UnaryMapExp MakeExp(const Exp -inline UnaryMapExp F(const Exp &src) { +inline UnaryMapExp +F(const Exp &src) { return MakeExp(src); } } // namespace expr diff --git a/mshadow/tensor.h b/mshadow/tensor.h index ad40037712ab..80c5b32d3f05 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -1,6 +1,5 @@ -#ifndef MSHADOW_TENSOR_H_ -#define MSHADOW_TENSOR_H_ /*! 
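*/

// Sketch of the transposed-dot overloads above, under the same DotEngine
// assumption as the earlier dot example: dot(A.T(), B) encodes the transpose
// as the compile-time flag ltrans = true inside DotExp, so no data is moved
// and the flag is simply forwarded to the underlying gemm call.
void TransposeDotDemo(void) {
  using namespace mshadow;
  using namespace mshadow::expr;
  Tensor<cpu, 2, float> A = NewTensor<cpu, float>(Shape2(4, 3), 1.0f);
  Tensor<cpu, 2, float> B = NewTensor<cpu, float>(Shape2(4, 5), 1.0f);
  Tensor<cpu, 2, float> C = NewTensor<cpu, float>(Shape2(3, 5), 0.0f);
  C = dot(A.T(), B);  // (3 x 4) * (4 x 5), transpose resolved at compile time
  FreeSpace(&A); FreeSpace(&B); FreeSpace(&C);
}

/*!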
+ * Copyright (c) 2014 by Contributors * \file tensor.h * \brief header file of tensor data structure and functions * This lib requires explicit memory allocation and de-allocation @@ -10,6 +9,8 @@ * For STL style tensor, see tensor_container.h * \author Bing Xu, Tianqi Chen */ +#ifndef MSHADOW_TENSOR_H_ +#define MSHADOW_TENSOR_H_ #include "./base.h" #include "./expression.h" @@ -83,7 +84,7 @@ struct Shape { MSHADOW_XINLINE Shape<2> FlatTo2D(void) const { Shape<2> s; s.shape_[1] = this->shape_[kDimension - 1]; - index_t ymax = 1; + index_t ymax = 1; #pragma unroll for (int i = 0; i < kDimension - 1; ++i) { ymax *= this->shape_[i]; @@ -170,7 +171,8 @@ MSHADOW_XINLINE Shape<3> Shape3(index_t s0, index_t s1, index_t s2) { * \param s0 size of dimension 0 * \return the shape construction */ -MSHADOW_XINLINE Shape<4> Shape4(index_t s3, index_t s2, index_t s1, index_t s0) { +MSHADOW_XINLINE Shape<4> Shape4(index_t s3, index_t s2, + index_t s1, index_t s0) { Shape<4> s; s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; return s; @@ -199,7 +201,8 @@ struct TRValue: public expr::RValueExp { * \tparam DType the type of elements in the tensor */ template -struct Tensor: public TRValue, Device, dimension, DType> { +struct Tensor: public TRValue, + Device, dimension, DType> { public: //-------------------------------- // struct memembers @@ -227,7 +230,7 @@ struct Tensor: public TRValue, Device, dimensio Stream *stream_; //-------------------------------- // functions - //-------------------------------- + //-------------------------------- /*! \brief default constructor */ MSHADOW_XINLINE Tensor(void) {} /*! \brief constructor from shape */ @@ -236,7 +239,8 @@ struct Tensor: public TRValue, Device, dimensio MSHADOW_XINLINE Tensor(DType *dptr, const Shape &shape) : dptr_(dptr), shape_(shape), stride_(shape[kSubdim]), stream_(NULL) {} /*! \brief constructor from data pointer and shape */ - MSHADOW_XINLINE Tensor(DType *dptr, const Shape &shape, index_t stride) + MSHADOW_XINLINE Tensor(DType *dptr, + const Shape &shape, index_t stride) : dptr_(dptr), shape_(shape), stride_(stride), stream_(NULL) {} /*! 
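*/

// A small worked example of Shape::FlatTo2D above, reusing the 2 x 5 x 2
// shape from basic.cpp: every dimension except the last is collapsed into
// the row count, and the last one becomes the column count.
#include "mshadow/tensor.h"

void ShapeDemo(void) {
  using namespace mshadow;
  Shape<3> s3 = Shape3(2, 5, 2);
  Shape<2> s2 = s3.FlatTo2D();  // s2[0] = 2 * 5 = 10, s2[1] = 2
  (void)s2;
}

/*!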
* \return memory cost of the tensor, including the aligned x dimension @@ -281,24 +285,29 @@ struct Tensor: public TRValue, Device, dimensio * \param end end position of slice * \return tensor after slice */ - MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const { + MSHADOW_XINLINE Tensor + Slice(index_t begin, index_t end) const { Shape s = this->shape; s[0] = end - begin; - return Tensor(dptr_ + this->MSize<1>() * begin, s, stride_); + return Tensor(dptr_ + this->MSize<1>() * begin, + s, stride_); } /*!\brief functions to fit expression template */ template - inline Tensor &operator=(const expr::Exp &exp) { + inline Tensor & + operator=(const expr::Exp &exp) { return this->__assign(exp); } /*!\brief functions to fit expression template */ template - inline Tensor &operator=(const expr::Exp &exp) { + inline Tensor & + operator=(const expr::Exp &exp) { return this->__assign(exp); } /*!\brief functions to fit expression template */ template - inline Tensor &operator=(const expr::Exp &exp) { + inline Tensor & + operator=(const expr::Exp &exp) { return this->__assign(exp); } inline Tensor &operator=(const DType &exp) { @@ -309,7 +318,8 @@ struct Tensor: public TRValue, Device, dimensio * respecialized class Tensor1D, thei is due to different implementation in operator[] */ template -struct Tensor: public expr::RValueExp, DType> { +struct Tensor: + public expr::RValueExp, DType> { public: DType *dptr_; Shape<1> shape_; @@ -331,21 +341,28 @@ struct Tensor: public expr::RValueExp MSHADOW_XINLINE index_t size(index_t i) const { return shape_[0]; } - MSHADOW_XINLINE DType &operator[](index_t idx) { return dptr_[idx]; } - MSHADOW_XINLINE const DType &operator[](index_t idx)const { return dptr_[idx]; } + MSHADOW_XINLINE DType &operator[](index_t idx) { + return dptr_[idx]; + } + MSHADOW_XINLINE const DType &operator[](index_t idx) const { + return dptr_[idx]; + } template - inline Tensor &operator=(const expr::Exp &exp) { + inline Tensor & + operator=(const expr::Exp &exp) { return this->__assign(exp); } template - inline Tensor &operator=(const expr::Exp &exp) { + inline Tensor & + operator=(const expr::Exp &exp) { return this->__assign(exp); } template - inline Tensor &operator=(const expr::Exp &exp) { + inline Tensor & + operator=(const expr::Exp &exp) { return this->__assign(exp); } - inline Tensor &operator=(const DType &exp) { + inline Tensor &operator=(const DType &exp) { return this->__assign(exp); } }; @@ -377,10 +394,12 @@ inline void InitTensorEngine(int device_id = 0); * \tparam DType type of element in tensor */ template -inline void AllocSpace(Tensor &obj, bool pad = MSHADOW_ALLOC_PAD); +inline void AllocSpace(Tensor *obj, + bool pad = MSHADOW_ALLOC_PAD); /*! \brief refer to comment of cpu ver \sa AllocSpace */ template -inline void AllocSpace(Tensor &obj, bool pad = MSHADOW_ALLOC_PAD); +inline void AllocSpace(Tensor *obj, + bool pad = MSHADOW_ALLOC_PAD); /*! * \brief CPU/GPU: free the space of tensor, will set obj.dptr to NULL * \param obj the tensor object @@ -388,10 +407,10 @@ inline void AllocSpace(Tensor &obj, bool pad = MSHADOW_ALLOC_PA * \tparam DType type of element in tensor */ template -inline void FreeSpace(Tensor &obj); +inline void FreeSpace(Tensor *obj); /*! \brief refer to comment of cpu ver \sa FreeSpace */ template -inline void FreeSpace(Tensor &obj); +inline void FreeSpace(Tensor *obj); /*! 
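*/

// Usage sketch of the pointer-taking AllocSpace/FreeSpace declared here,
// assuming the CPU implementations that tensor_cpu-inl.h provides:
#include "mshadow/tensor.h"

void AllocDemo(void) {
  using namespace mshadow;
  Tensor<cpu, 2, float> t(Shape2(3, 8));
  AllocSpace(&t);      // pads the x dimension: t.stride_ may exceed 8
  t = 0.0f;
  Tensor<cpu, 2, float> top = t.Slice(0, 2);  // view on rows [0, 2)
  (void)top;           // shares memory with t, no copy is made
  FreeSpace(&t);       // releases the memory and sets t.dptr_ to NULL
}

/*!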
* \brief CPU/GPU: short cut to allocate and initialize a Tensor * \param shape: shape of tensor @@ -403,7 +422,9 @@ inline void FreeSpace(Tensor &obj); * \sa AllocSpace */ template -inline Tensor NewTensor(const Shape &shape, DType initv, bool pad = MSHADOW_ALLOC_PAD); +inline Tensor NewTensor(const Shape &shape, + DType initv, + bool pad = MSHADOW_ALLOC_PAD); /*! * \brief copy data from one tensor to another, with same shape * \param dst target tensor @@ -412,16 +433,20 @@ inline Tensor NewTensor(const Shape &shape, DType initv * \tparam DType type of element in tensor */ template -inline void Copy(Tensor dst, const Tensor &src); +inline void Copy(Tensor dst, + const Tensor &src); /*! \brief refer to comment of cpu ver \sa Copy */ template -inline void Copy(Tensor dst, const Tensor &src); +inline void Copy(Tensor dst, + const Tensor &src); /*! \brief refer to comment of cpu ver \sa Copy */ template -inline void Copy(Tensor dst, const Tensor &src); +inline void Copy(Tensor dst, + const Tensor &src); /*! \brief refer to comment of cpu ver \sa Copy */ template -inline void Copy(Tensor dst, const Tensor &src); +inline void Copy(Tensor dst, + const Tensor &src); /*! * \brief CPU/GPU: normalize softmax: dst[i][j] = exp(energy[i][j]) /(sum_j exp(energy[i][j])) * \param dst destination @@ -444,11 +469,15 @@ inline void Softmax(Tensor dst, const Tensor &energy); * \param exp expression * \sa namespace mshadow:sv, mshadow::op, mshadow::expr */ -template -inline void MapExp(TRValue dst, const expr::Exp &exp); +template +inline void MapExp(TRValue dst, + const expr::Exp &exp); /*! \brief refer to comment of cpu ver \sa MapExp */ -template -inline void MapExp(TRValue dst, const expr::Exp &exp); +template +inline void MapExp(TRValue dst, + const expr::Exp &exp); /*! * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in lowest dimension (dimension 0) * \tparam Saver specify storage method @@ -462,11 +491,17 @@ inline void MapExp(TRValue dst, const expr::Exp -inline void MapReduceKeepLowest(TRValue dst, const expr::Exp &exp, DType scale = 1); +template +inline void MapReduceKeepLowest(TRValue dst, + const expr::Exp &exp, + DType scale = 1); /*! \brief refer to comment of cpu ver \sa MapReduceKeepLowest */ -template -inline void MapReduceKeepLowest(TRValue dst, const expr::Exp &exp, DType scale = 1); +template +inline void MapReduceKeepLowest(TRValue dst, + const expr::Exp &exp, + DType scale = 1); /*! * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in third dimension (dimension 2) * \tparam Saver specify storage method @@ -481,12 +516,18 @@ inline void MapReduceKeepLowest(TRValue dst, const expr::Exp -inline void MapReduceKeepHighDim(TRValue dst, const expr::Exp &exp, DType scale = 1); +template +inline void MapReduceKeepHighDim(TRValue dst, + const expr::Exp &exp, + DType scale = 1); /*! \brief refer to comment of cpu ver \sa MapReduceKeepHighDim */ -template -inline void MapReduceKeepHighDim(TRValue dst, const expr::Exp &exp, DType scale = 1); +template +inline void MapReduceKeepHighDim(TRValue dst, + const expr::Exp &exp, + DType scale = 1); } // namespace mshadow - +// include headers #include "./expr_engine-inl.h" -#endif // TENSOR_H +#endif // MSHADOW_TENSOR_H_ diff --git a/mshadow/utils.h b/mshadow/utils.h index 33b038d2418a..6003f5562814 100644 --- a/mshadow/utils.h +++ b/mshadow/utils.h @@ -1,13 +1,15 @@ -#ifndef MSHADOW_UTILS_H_ -#define MSHADOW_UTILS_H_ /*! 
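*/

// Usage sketch of the Softmax declaration above (CPU case): normalize each
// row of a 2D tensor so that it sums to one.
#include "mshadow/tensor.h"

void SoftmaxDemo(void) {
  using namespace mshadow;
  Tensor<cpu, 2, float> energy = NewTensor<cpu, float>(Shape2(2, 4), 1.0f);
  Tensor<cpu, 2, float> prob = NewTensor<cpu, float>(Shape2(2, 4), 0.0f);
  Softmax(prob, energy);  // every row of prob is now a distribution
  FreeSpace(&energy); FreeSpace(&prob);
}

/*!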
+ * Copyright (c) 2014 by Contributors * \file utils.h * \brief simple utils for error and checkings * \author Tianqi Chen */ +#ifndef MSHADOW_UTILS_H_ +#define MSHADOW_UTILS_H_ #define _CRT_SECURE_NO_WARNINGS #include #include +#include #include namespace mshadow { /*! \brief namespace for helper utils of the project */ @@ -76,4 +78,4 @@ inline void Error(const char *fmt, ...) { } } // namespace utils } // namespace mshadow -#endif +#endif // MSHADOW_UTILS_H_ From b6eb0d1888f98a844b99529b943e605ba51d0c8d Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 25 Dec 2014 04:04:15 -0800 Subject: [PATCH 018/147] check in cpu basic ops --- doc/Doxyfile | 2 +- example/Makefile | 6 +- example/basic.cpp | 94 ++++---- example/defop.cpp | 54 ++--- mshadow/base.h | 13 +- mshadow/expr_engine-inl.h | 46 +++- mshadow/expression-inl.h | 36 +-- mshadow/expression.h | 28 ++- mshadow/sse-inl.h | 435 +++++++++++++++++++++++++++++++++++++ mshadow/tensor.h | 18 +- mshadow/tensor_cpu-inl.h | 206 ++++++++++++++++++ mshadow/tensor_cpu-inl.hpp | 177 --------------- mshadow/tensor_sse-inl.hpp | 431 ------------------------------------ 13 files changed, 810 insertions(+), 736 deletions(-) create mode 100644 mshadow/sse-inl.h create mode 100644 mshadow/tensor_cpu-inl.h delete mode 100644 mshadow/tensor_cpu-inl.hpp delete mode 100644 mshadow/tensor_sse-inl.hpp diff --git a/doc/Doxyfile b/doc/Doxyfile index bef8089a3021..38bd831fa338 100644 --- a/doc/Doxyfile +++ b/doc/Doxyfile @@ -101,7 +101,7 @@ FILE_PATTERNS = RECURSIVE = NO EXCLUDE = EXCLUDE_SYMLINKS = NO -EXCLUDE_PATTERNS = *-inl.hpp +EXCLUDE_PATTERNS = *-inl.* EXCLUDE_SYMBOLS = mshadow::expr::Plan* mshadow::expr::*Engine* EXAMPLE_PATH = EXAMPLE_PATTERNS = diff --git a/example/Makefile b/example/Makefile index 22652c96fa7e..836b192f90d3 100644 --- a/example/Makefile +++ b/example/Makefile @@ -1,9 +1,9 @@ # set LD_LIBRARY_PATH export CC = gcc -export CXX = g++ +export CXX = clang++ export NVCC =nvcc -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -I/opt/intel/mkl/include -I/usr/local/cuda/include/ -L/opt/intel/mkl/lib/intel64 -L/opt/intel/lib/intel64 -L/usr/local/cuda/lib64 -export LDFLAGS= -lm -lcudart -lcublas -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lpthread +export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ +export LDFLAGS= -lm export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) # specify tensor path diff --git a/example/basic.cpp b/example/basic.cpp index 5d3d9964a9f4..76cb4e8e3f32 100644 --- a/example/basic.cpp +++ b/example/basic.cpp @@ -5,53 +5,53 @@ using namespace mshadow; // this namespace contains all operator overloads using namespace mshadow::expr; -int main( void ){ - // intialize tensor engine before using tensor operation, needed for CuBLAS - InitTensorEngine(); - // assume we have a float space - real_t data[ 20 ]; - // create a 2 x 5 x 2 tensor, from existing space - Tensor ts( data, Shape3(2,5,2) ); - // take first subscript of the tensor - Tensor mat = ts[0]; - // Tensor object is only a handle, assignment means they have same data content - Tensor mat2 = mat; - - // shape of matrix, note shape order is different from numpy - // shape[i] indicate the shape of i-th dimension - printf("%u X %u matrix\n", mat.shape[1], mat.shape[0] ); - - // initialize all element to zero - mat = 0.0f; - // assign some values - mat[0][1] = 1.0f; mat[1][0] = 2.0f; - // elementwise operations - mat += ( mat + 10.0f ) / 10.0f + 2.0f; - - // print out matrix, note: mat2 and mat1 are handles(pointers) - 
for( index_t i = 0; i < mat.shape[1]; i ++ ){ - for( index_t j = 0; j < mat.shape[0]; j ++ ){ - printf("%.2f ", mat2[i][j]); - } - printf("\n"); +int main(void) { + // intialize tensor engine before using tensor operation, needed for CuBLAS + //InitTensorEngine(); + // assume we have a float space + double data[20]; + // create a 2 x 5 x 2 tensor, from existing space + Tensor ts(data, Shape3(2,5,2)); + // take first subscript of the tensor + Tensor mat = ts[0]; + // Tensor object is only a handle, assignment means they have same data content + Tensor mat2 = mat; + + // shape of matrix, note shape order is different from numpy + // shape[i] indicate the shape of i-th dimension + printf("%u X %u matrix, stride=%u\n", mat.size(0), mat.size(1), mat.stride_); + + // initialize all element to zero + mat = 0.0f; + // assign some values + mat[0][1] = 1.0f; mat[1][0] = 2.0f; + // elementwise operations + mat = mat + 2.0f; + + // print out matrix, note: mat2 and mat1 are handles(pointers) + for (index_t i = 0; i < mat.size(0); ++i) { + for (index_t j = 0; j < mat.size(1); ++j) { + printf("%.2f ", mat[i][j]); } - - // create a tensor without explictly allocating spaces. - Tensor mat3 = NewTensor(Shape2(2, 5), 0.0f); - Tensor mat4 = NewTensor(Shape2(2, 5), 1.0f); - // transpose, and then add mat4. - mat3 = mat.T() + mat4; - - // index the shape using size(), this is more natural for MATLAB/numpy user. - printf("%u X %u matrix\n", mat3.size(0), mat3.size(1) ); - // print out matrix - for( index_t i = 0; i < mat3.size(0); i ++ ){ - for( index_t j = 0; j < mat3.size(1); j ++ ){ - printf("%.2f ", mat3[i][j]); - } - printf("\n"); + printf("\n"); + } + + // create a tensor without explictly allocating spaces. + Tensor mat3 = NewTensor(Shape2(2, 5), 0.0f); + Tensor mat4 = NewTensor(Shape2(2, 5), 1.0f); + // transpose, and then add mat4. + mat3 = tcast(mat.T()) + mat4; + + // index the shape using size(), this is more natural for MATLAB/numpy user. 
+ printf("%u X %u matrix\n", mat3.size(0), mat3.size(1)); + // print out matrix + for (index_t i = 0; i < mat3.size(0); ++i) { + for (index_t j = 0; j < mat3.size(1); ++j) { + printf("%.2f ", mat3[i][j]); } - // shutdown tensor enigne after usage - ShutdownTensorEngine(); - return 0; + printf("\n"); + } + // shutdown tensor enigne after usage + //ShutdownTensorEngine(); + return 0; } diff --git a/example/defop.cpp b/example/defop.cpp index 990f4728bed7..e1b1b311bb9a 100644 --- a/example/defop.cpp +++ b/example/defop.cpp @@ -8,37 +8,39 @@ using namespace mshadow::expr; // user defined unary operator addone struct addone{ - MSHADOW_XINLINE static real_t Map(real_t a) { - return a + 1.0f; - } + template + MSHADOW_XINLINE static DType Map(DType a) { + return a + static_cast(1); + } }; // user defined binary operator max of two struct maxoftwo{ - MSHADOW_XINLINE static real_t Map(real_t a,real_t b) { - if( a > b ) return a; - else return b; - } + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + if(a > b) return a; + else return b; + } }; -int main( void ){ - // intialize tensor engine before using tensor operation, needed for CuBLAS - InitTensorEngine(); - // take first subscript of the tensor - Tensor mat = NewTensor( Shape2(2,3), 0.0f ); - Tensor mat2= NewTensor( Shape2(2,3), 0.0f ); - - mat[0][0] = -2.0f; - mat = F( F( mat ) + 1.0f, mat2 ); - - for( index_t i = 0; i < mat.shape[1]; i ++ ){ - for( index_t j = 0; j < mat.shape[0]; j ++ ){ - printf("%.2f ", mat[i][j]); - } - printf("\n"); +int main(void){ + // intialize tensor engine before using tensor operation, needed for CuBLAS + //InitTensorEngine(); + // take first subscript of the tensor + Tensor mat = NewTensor(Shape2(2,3), 0.0f); + Tensor mat2= NewTensor(Shape2(2,3), 0.0f); + + mat[0][0] = -2.0f; + mat = F(F(mat) + 1.0f, mat2); + + for(index_t i = 0; i < mat.size(0); ++i){ + for(index_t j = 0; j < mat.size(1); ++j){ + printf("%.2f ", mat[i][j]); } + printf("\n"); + } - FreeSpace( mat ); FreeSpace( mat2 ); - // shutdown tensor enigne after usage - ShutdownTensorEngine(); - return 0; + FreeSpace(&mat); FreeSpace(&mat2); + // shutdown tensor enigne after usage + //ShutdownTensorEngine(); + return 0; } diff --git a/mshadow/base.h b/mshadow/base.h index 71f7a0b8382b..03498159f6e6 100644 --- a/mshadow/base.h +++ b/mshadow/base.h @@ -24,10 +24,13 @@ #ifndef MSHADOW_ALLOC_PAD #define MSHADOW_ALLOC_PAD true #endif -/*! - * \brief x dimension of data must be bigger pad_size * ratio to be alloced padded memory, otherwise use tide allocation - * for example, if pad_ratio=2, GPU memory alignement size is 32, then we will only allocate padded memory if x dimension > 64 - * set it to 0 then we will always allocate padded memory +/*! + * \brief + * x dimension of data must be bigger pad_size * ratio to be alloced padded memory, + * otherwise use tide allocation + * for example, if pad_ratio=2, GPU memory alignement size is 32, + * then we will only allocate padded memory if x dimension > 64 + * set it to 0 then we will always allocate padded memory */ #ifndef MSHADOW_MIN_PAD_RATIO #define MSHADOW_MIN_PAD_RATIO 2 @@ -60,7 +63,7 @@ #endif /*! \brief whether use SSE */ #ifndef MSHADOW_USE_SSE - #define MSHADOW_USE_SSE 0 + #define MSHADOW_USE_SSE 1 #endif /*! 
\brief whether use NVML to get dynamic info */ #ifndef MSHADOW_USE_NVML diff --git a/mshadow/expr_engine-inl.h b/mshadow/expr_engine-inl.h index 5448b6774b22..733b7682f1ee 100644 --- a/mshadow/expr_engine-inl.h +++ b/mshadow/expr_engine-inl.h @@ -49,8 +49,8 @@ class Plan { template class Plan, DType> { public: - explicit Plan(const Tensor &t) - : dptr_(t.dptr), stride_(t.shape.stride_) {} + explicit Plan(const Tensor &t) + : dptr_(t.dptr_), stride_(t.stride_) {} // for RValue, the return type should be reference MSHADOW_XINLINE DType &Eval(index_t y, index_t x) { return dptr_[y * stride_ + x]; @@ -68,7 +68,7 @@ class Plan, DType> { template class Plan, DType> { public: - explicit Plan(const Tensor &t) : dptr_(t.dptr) {} + explicit Plan(const Tensor &t) : dptr_(t.dptr_) {} MSHADOW_XINLINE DType &Eval(index_t y, index_t x) { return dptr_[x]; } @@ -91,6 +91,19 @@ class Plan, DType> { private: DType scalar_; }; +// unary expression +template +class Plan, DstDType> { + public: + explicit Plan(const Plan &src) : src_(src) {} + MSHADOW_XINLINE DstDType Eval(index_t y, index_t x) const { + return static_cast(src_.Eval(y, x)); + } + + private: + Plan src_; +}; // binary expression template class Plan, DType> { @@ -144,7 +157,7 @@ class Plan, DType> { //---------------------------------------------------------------------- // Mappings from expression to plans //--------------------------------------------------------------------- -template +template inline Plan, DType> MakePlan(const BinaryMapExp &e); @@ -153,6 +166,12 @@ inline Plan, DType> MakePlan(const ScalarExp &e) { return Plan, DType>(e.scalar_); } +template +inline Plan, DstDType> +MakePlan(const TypecastExp &e) { + return Plan, DstDType>(MakePlan(e.exp)); +} + template inline Plan MakePlan(const RValueExp &e) { return Plan(e.self()); @@ -196,7 +215,6 @@ template struct ExpInfo { static const int kDim = -1; static const int kDevMask = 0; - static const int kTypeMask = 0; }; template struct ExpInfo< ScalarExp > { @@ -208,6 +226,11 @@ struct ExpInfo > { static const int kDim = ExpInfo::kDim; static const int kDevMask = ExpInfo::kDevMask; }; +template +struct ExpInfo > { + static const int kDim = ExpInfo::kDim; + static const int kDevMask = ExpInfo::kDevMask; +}; template struct ExpInfo > { static const int kDim = dim; @@ -279,6 +302,13 @@ struct ShapeCheck > { return shape; } }; +template +struct ShapeCheck > { + inline static Shape + Check(const TypecastExp &exp) { + return ShapeCheck::Check(exp.exp); + } +}; template struct ShapeCheck > { inline static Shape Check(const TransposeExp &e) { @@ -340,17 +370,17 @@ struct ExpEngine > { template inline static void Eval(Tensor *dst, const Exp &exp) { - MapExp(*dst, exp); + MapExp(dst, exp); } template inline static void Eval(Tensor *dst, const Exp &exp) { - MapExp(*dst, exp); + MapExp(dst, exp); } template inline static void Eval(Tensor *dst, const Exp &exp) { - MapExp(*dst, exp); + MapExp(dst, exp); } template inline static void Eval(Tensor *dst, diff --git a/mshadow/expression-inl.h b/mshadow/expression-inl.h index 283bc96d2676..77ebe6f69fdb 100644 --- a/mshadow/expression-inl.h +++ b/mshadow/expression-inl.h @@ -31,15 +31,15 @@ operator*(MSHADOW_SCALAR_ lhs, template inline BinaryMapExp, MSHADOW_SCALAR_, (ta|type::kMapper)> -F(const Exp &lhs, MSHADOW_SCALAR_ rhs) { - return MakeExp(lhs, scalar(rhs)); +F(const Exp &lhs, const ScalarExp &rhs) { + return MakeExp(lhs, rhs); } /*! 
\brief operator overload for const */ template inline BinaryMapExp, TB, MSHADOW_SCALAR_, (tb|type::kMapper)> -F(MSHADOW_SCALAR_ lhs, const Exp &rhs) { - return MakeExp(scalar(lhs), rhs); +F(const ScalarExp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); } // constant operators /*! \brief operator overload */ @@ -47,16 +47,16 @@ template inline BinaryMapExp, MSHADOW_SCALAR_, (ta|type::kMapper)> operator+(const Exp &lhs, - const MSHADOW_SCALAR_ &rhs) { - return MakeExp(lhs, scalar(rhs)); + const ScalarExp &rhs) { + return MakeExp(lhs, rhs); } /*! \brief operator overload */ template inline BinaryMapExp, MSHADOW_SCALAR_, (ta|type::kMapper)> operator-(const Exp &lhs, - const MSHADOW_SCALAR_ &rhs) { - return MakeExp(lhs, scalar(rhs)); + const ScalarExp &rhs) { + return MakeExp(lhs, rhs); } /*! \brief operator overload */ template @@ -71,40 +71,40 @@ template inline BinaryMapExp, MSHADOW_SCALAR_, (ta|type::kMapper)> operator/(const Exp &lhs, - const MSHADOW_SCALAR_ &rhs) { - return MakeExp(lhs, scalar(rhs)); + const ScalarExp &rhs) { + return MakeExp(lhs, rhs); } // constant operators 2 /*! \brief operator overload */ template inline BinaryMapExp, TB, MSHADOW_SCALAR_, (tb|type::kMapper)> -operator+(MSHADOW_SCALAR_ lhs, +operator+(const ScalarExp &lhs, const Exp &rhs) { - return MakeExp(scalar(lhs), rhs); + return MakeExp(lhs, rhs); } /*! \brief operator overload */ template inline BinaryMapExp, TB, MSHADOW_SCALAR_, (tb|type::kMapper)> -operator-(MSHADOW_SCALAR_ lhs, +operator-(const ScalarExp &lhs, const Exp &rhs) { - return MakeExp(scalar(lhs), rhs); + return MakeExp(lhs, rhs); } /*! \brief operator overload */ template inline BinaryMapExp, TB, MSHADOW_SCALAR_, (tb|type::kMapper)> -operator*(MSHADOW_SCALAR_ lhs, +operator*(const ScalarExp &lhs, const Exp &rhs) { - return MakeExp(scalar(lhs), rhs); + return MakeExp(lhs, rhs); } /*! \brief operator overload */ template inline BinaryMapExp, TB, MSHADOW_SCALAR_, (tb|type::kMapper)> -operator/(MSHADOW_SCALAR_ lhs, const Exp &rhs) { - return MakeExp(scalar(lhs), rhs); +operator/(const ScalarExp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); } } // namespace expr } // namespace mshadow diff --git a/mshadow/expression.h b/mshadow/expression.h index 902b5d002017..fe8f50773b87 100644 --- a/mshadow/expression.h +++ b/mshadow/expression.h @@ -78,31 +78,36 @@ template struct ScalarExp: public Exp, DType, type::kMapper> { /*! \brief scalar value */ DType scalar_; - /*! \brief constructor */ - explicit ScalarExp(DType scalar) : scalar_(scalar) {} + /*! \brief implicit constructor, MUST NOT BE explicit */ + ScalarExp(DType scalar) : scalar_(scalar) {} }; /*! \brief create an scalar expression */ template -inline ScalarExp scalar(const DType &s) { +inline ScalarExp scalar(DType s) { return ScalarExp(s); } /*! * \brief typecast expression, cast the type of elements - * \tparam DType the target type we want to cast into + * \tparam DstDType the target type we want to cast into + * \tparam SrcDType the target type we want to cast from * \tparam EType the type of the source expression * \tparam etype the type of expression after cast */ -template -struct TypecastExp: public Exp, DType, etype> { +template +struct TypecastExp: + public Exp, + DstDType, etype> { const EType &exp; /*! \brief constructor */ explicit TypecastExp(const EType &e) : exp(e) {} }; /*! 
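*/

// A small sketch of what the implicit ScalarExp constructor above buys:
// plain literals convert to ScalarExp<DType> at the call site, so the
// overloads taking const ScalarExp<MSHADOW_SCALAR_>& accept them directly.
#include "mshadow/tensor.h"

void ImplicitScalarDemo(void) {
  using namespace mshadow;
  using namespace mshadow::expr;
  Tensor<cpu, 2, float> mat = NewTensor<cpu, float>(Shape2(2, 3), 0.0f);
  mat = mat + 1.0f;         // 1.0f becomes ScalarExp<float> implicitly
  mat = 2.0f * mat - 0.5f;  // both scalar positions work
  FreeSpace(&mat);
}

/*!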
\brief create an scalar expression */ -template -inline TypecastExp -tcast(const Exp &exp) { - return TypecastExp(exp.self()); +template +inline TypecastExp +tcast(const Exp &exp) { + return TypecastExp(exp.self()); } /*! \brief represent a transpose expression of a container */ template @@ -351,8 +356,7 @@ F(const Exp &src) { } } // namespace expr } // namespace mshadow - - +// add definition of scalar related operators #ifdef MSAHDOW_SCALAR_ #error "MSHADOW_SCALAR_ must not be defined" #endif diff --git a/mshadow/sse-inl.h b/mshadow/sse-inl.h new file mode 100644 index 000000000000..ea5392291fd1 --- /dev/null +++ b/mshadow/sse-inl.h @@ -0,0 +1,435 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file sse-inl.h + * \brief support of sse2 optimization of some operations + * \author Tianqi Chen + */ +#ifndef MSHADOW_SSE_INL_H_ +#define MSHADOW_SSE_INL_H_ +#ifdef __APPLE__ +#include +#else +#include +#endif +#include "./expression.h" +#include "./tensor.h" + +namespace mshadow { +/*! \brief namespace to support sse2 vectorization */ +namespace sse2 { +/*! + * \brief analog to cudaMallocPitch, allocate a aligned space with num_line * lspace cells + * \param out_pitch output parameter, the actuall space allocated for each line + * \param lspace number of cells required for each line + * \param num_line number of lines to be allocated + */ +inline void* AlignedMallocPitch(size_t *out_pitch, + size_t lspace, size_t num_line) { + size_t pitch = ((lspace+15) >> 4) << 4; + *out_pitch = pitch; +#ifdef _MSC_VER + void *res = _aligned_malloc(pitch * num_line, 16); +#else +#ifdef __APPLE__ + void *res = malloc(pitch * num_line); +#else + void *res = memalign(16, pitch * num_line); +#endif +#endif + utils::Assert(res != NULL, "AlignedMallocPitch failed"); + return res; +} +/*! + * \brief free aligned space + * \param ptr pointer to space to be freed + */ +inline void AlignedFree(void *ptr) { +#ifdef _MSC_VER + _aligned_free(ptr); +#else + free(ptr); +#endif +} +/*! \brief check if a pointer is aligned */ +inline bool CheckAlign(size_t pitch) { + return !(pitch & ((1 << 4) - 1)); +} +/*! \brief check if a pointer is aligned */ +inline bool CheckAlign(void *ptr) { + return CheckAlign(reinterpret_cast(ptr)); +} +/*! + * \brief get upper bound of aligned index of size + * \param size size of the array + * \param fsize size of float + */ +inline index_t UpperAlign(index_t size, size_t fsize) { + return (((size * fsize + 15) >> 4) << 4) / fsize; +} +/*! + * \brief get lower bound of aligned index of size + * \param size size of the array + * \param fsize size of float + */ +inline index_t LowerAlign(index_t size, size_t fsize) { + return (((size * fsize) >> 4) << 4) / fsize; +} +} // namespace sse2 +} // namespace mshadow +#if MSHADOW_USE_SSE +// sse types are not compatible with nvcc, only use them in cpu mode +#include + +namespace mshadow { +namespace sse2 { +/*! + * \brief float vector real type, used for vectorization + * \tparam FloatType double or float + */ +template +struct FVec { + // whether the vectorization is enabled + static const bool kEnabled = false; +}; +/*! \brief vector real type for float */ +template<> +struct FVec { + // type + typedef __m128 DType; + // whether the vectorization is enabled + static const bool kEnabled = true; + /*! \brief number of float in vector */ + static const index_t kSize = 4; + /*! 
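*/

// Worked example of the alignment helpers above for float (fsize = 4, and a
// 16-byte SSE register holds 4 float lanes); MapSSEPlan below relies on
// LowerAlign to split each row into a vectorized body and a scalar tail.
void AlignDemo(void) {
  using mshadow::index_t;
  // UpperAlign(10, 4) = (((10 * 4 + 15) >> 4) << 4) / 4 = 48 / 4 = 12
  index_t up = mshadow::sse2::UpperAlign(10, sizeof(float));
  // LowerAlign(10, 4) = (((10 * 4) >> 4) << 4) / 4 = 32 / 4 = 8, so a
  // 10-column row gets columns [0, 8) vectorized and 8..9 done in scalar code
  index_t lo = mshadow::sse2::LowerAlign(10, sizeof(float));
  (void)up; (void)lo;
}

/*!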
\brief data content */
+  DType data_;
+  // functions
+  /* constructors */
+  FVec(void) {}
+  explicit FVec(DType data) : data_(data) {}
+  /* set the float */
+  explicit FVec(const float &s) {
+    data_ = _mm_set1_ps(s);
+  }
+  /*!\brief load from pointer src */
+  explicit FVec(const float *src) {
+    data_ = _mm_load_ps(src);
+  }
+  /*! \brief store data into dst space */
+  inline void Store(float *dst) const {
+    return _mm_store_ps(dst, data_);
+  }
+  /*! \brief sum of all content */
+  inline float Sum(void) const {
+    DType ans = _mm_add_ps(data_, _mm_movehl_ps(data_, data_));
+    DType rst = _mm_add_ss(ans, _mm_shuffle_ps(ans, ans, 1));
+#if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64)
+    return rst.m128_f32[0];
+#else
+    float rr = _mm_cvtss_f32(rst);
+    return rr;
+#endif
+  }
+};
+/*! \brief vector real type for double */
+template<>
+struct FVec<double> {
+  // data type
+  typedef __m128d DType;
+  // whether the vectorization is enabled
+  static const bool kEnabled = true;
+  /*! \brief number of doubles in vector */
+  static const index_t kSize = 2;
+  /*! \brief data content */
+  DType data_;
+  /* constructors */
+  FVec(void) {}
+  explicit FVec(DType data) : data_(data) {}
+  /* set the double */
+  explicit FVec(const double &s) {
+    data_ = _mm_set1_pd(s);
+  }
+  /*!\brief load from pointer src */
+  explicit FVec(const double *src) {
+    data_ = _mm_load_pd(src);
+  }
+  /*! \brief store data into dst space */
+  inline void Store(double *dst) const {
+    return _mm_store_pd(dst, data_);
+  }
+  /*! \brief sum of all content */
+  inline double Sum(void) const {
+    DType tmp = _mm_add_sd(data_, _mm_unpackhi_pd(data_, data_));
+#if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64)
+    return tmp.m128d_f64[0];
+#else
+    double ans = _mm_cvtsd_f64(tmp);
+    return ans;
+#endif
+  }
+};
+/*!
\brief sse2 operator type of certain operator */ +template +struct SSEOp{ + static const bool kEnabled = false; +}; +template<> +struct SSEOp { + static const bool kEnabled = true; + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_add_ps(lhs.data_, rhs.data_)); + } + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_add_pd(lhs.data_, rhs.data_)); + } +}; +template<> +struct SSEOp { + static const bool kEnabled = true; + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_sub_ps(lhs.data_, rhs.data_)); + } + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_sub_pd(lhs.data_, rhs.data_)); + } +}; +template<> +struct SSEOp { + static const bool kEnabled = true; + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_mul_ps(lhs.data_, rhs.data_)); + } + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_mul_pd(lhs.data_, rhs.data_)); + } +}; +template<> +struct SSEOp { + static const bool kEnabled = true; + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_div_ps(lhs.data_, rhs.data_)); + } + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_div_pd(lhs.data_, rhs.data_)); + } +}; +template<> +struct SSEOp { + static const bool kEnabled = true; + MSHADOW_CINLINE static FVec Map(const FVec &src) { + return src; + } + MSHADOW_CINLINE static FVec Map(const FVec &src) { + return src; + } +}; +// savers to do storage +template +struct Saver{ + MSHADOW_CINLINE static void Save(TFloat *dst, const FVec &src) { + FVec lhs(dst); + FVec ans = SSEOp::Map(lhs, src); + ans.Store(dst); + } +}; +template +struct Saver { + MSHADOW_CINLINE static void Save(TFloat *dst, const FVec &src) { + src.Store(dst); + } +}; +} // namespace sse2 +namespace expr { +// same as plan, but use sse2 +template +class SSEPlan { + public: + /*! 
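*/

// A standalone sketch of what the Saver above expands to for one packet when
// SV::OPType is op::plus (as in sv::plusto); plain SSE intrinsics, outside
// mshadow: load the destination, apply the op in 4 lanes, store back.
#include <xmmintrin.h>

inline void SaveAddTo(float *dst, __m128 src) {  // dst[0..3] += src
  __m128 lhs = _mm_load_ps(dst);  // requires 16-byte aligned dst
  _mm_store_ps(dst, _mm_add_ps(lhs, src));
}

/*!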
+ * \brief evaluate the expression at index [y][x], x will be aligned to 4 + * to be implemented by SubType + */ + MSHADOW_CINLINE sse2::FVec EvalSSE(index_t y, index_t x) const; + MSHADOW_CINLINE DType Eval(index_t y, index_t x) const; +}; +template +class SSEPlan, DType> { + public: + explicit SSEPlan(const Tensor &t) + :dptr_(t.dptr_), stride_(t.stride_) {} + MSHADOW_CINLINE sse2::FVec EvalSSE(index_t y, index_t x) const { + return sse2::FVec(&dptr_[y * stride_ + x]); + } + MSHADOW_CINLINE DType Eval(index_t y, index_t x) const { + return dptr_[y * stride_ + x]; + } + + private: + const DType *dptr_; + index_t stride_; +}; +template +class SSEPlan, DType> { + public: + explicit SSEPlan(DType scalar) : scalar_(scalar) {} + MSHADOW_CINLINE sse2::FVec EvalSSE(index_t y, index_t x) const { + return sse2::FVec(scalar_); + } + MSHADOW_CINLINE DType Eval(index_t y, index_t x) const { + return scalar_; + } + + private: + DType scalar_; +}; +template +class SSEPlan, DType> { + public: + SSEPlan(const SSEPlan &lhs, const SSEPlan &rhs) + : lhs_(lhs), rhs_(rhs) {} + MSHADOW_CINLINE sse2::FVec EvalSSE(index_t y, index_t x) const { + return sse2::SSEOp::Map(lhs_.EvalSSE(y, x), rhs_.EvalSSE(y, x)); + } + MSHADOW_CINLINE DType Eval(index_t y, index_t x) const { + return OP::Map(lhs_.Eval(y, x), rhs_.Eval(y, x)); + } + + private: + SSEPlan lhs_; + SSEPlan rhs_; +}; + +template +class SSEPlan, DType> { + public: + SSEPlan(const SSEPlan &src) : src_(src) {} + MSHADOW_CINLINE sse2::FVec EvalSSE(index_t y, index_t x) const { + return sse2::SSEOp::Map(src_.EvalSSE(y, x)); + } + MSHADOW_CINLINE DType Eval(index_t y, index_t x) const { + return OP::Map(src_.Eval(y, x)); + } + + private: + SSEPlan src_; +}; + +template +inline SSEPlan, DType> +MakeSSEPlan(const BinaryMapExp &e); + +template +inline SSEPlan, DType> MakeSSEPlan(const ScalarExp &e) { + return SSEPlan, DType>(e.scalar_); +} +template +inline SSEPlan MakeSSEPlan(const RValueExp &e) { + return SSEPlan(e.self()); +} +template +inline SSEPlan +MakeSSEPlan(const MakeTensorExp &e) { + return SSEPlan(e.real_self()); +} +template +inline SSEPlan, DType> +MakeSSEPlan(const UnaryMapExp &e) { + return SSEPlan, DType>(MakeSSEPlan(e.src_)); +} +template +inline SSEPlan, DType> +MakeSSEPlan(const BinaryMapExp &e) { + return SSEPlan, + DType>(MakeSSEPlan(e.lhs_), MakeSSEPlan(e.rhs_)); +} +/*! 
+ * \brief static check sse enable + * if a expression E can not be evaluated using sse, then kPass = false + * \tparam Device the type of Device + * \tparam dim dimension of the tensor + * \tparam E expression + */ +template +struct SSECheck{ + static const bool kPass = false; +}; +template +struct SSECheck > { + static const bool kPass = sse2::FVec::kEnabled; +}; +template +struct SSECheck > { + static const bool kPass = sse2::FVec::kEnabled; +}; +template +struct SSECheck > { + static const bool kPass = SSECheck::kPass && sse2::SSEOp::kEnabled; +}; +template +struct SSECheck< BinaryMapExp > { + static const bool kPass = SSECheck::kPass && + SSECheck::kPass && sse2::SSEOp::kEnabled; +}; +//------------------------------------------------- +// Check if data is aligned and allow sse operation +//------------------------------------------------- +template +struct SSEAlignCheck { + inline static bool Check(const E &exp) { + return false; + } +}; +template +struct SSEAlignCheck > { + inline static bool Check(const ScalarExp &exp) { + return true; + } +}; +template +struct SSEAlignCheck > { + inline static bool Check(const Tensor &t) { + return sse2::CheckAlign(t.dptr_) && + sse2::CheckAlign(t.stride_ * sizeof(DType)); + } +}; +template +struct SSEAlignCheck > { + inline static bool Check(const UnaryMapExp &t) { + return SSEAlignCheck::Check(t.src_); + } +}; +template +struct SSEAlignCheck > { + inline static bool Check(const BinaryMapExp &t) { + return SSEAlignCheck::Check(t.lhs_) && + SSEAlignCheck::Check(t.rhs_); + } +}; +/*! + * \brief use SSEPlan to compute result + */ +template +inline void MapSSEPlan(Tensor _dst, + const expr::SSEPlan &plan) { + Tensor dst = _dst.FlatTo2D(); + const index_t xlen = sse2::LowerAlign(dst.size(1), sizeof(DType)); + for (index_t y = 0; y < dst.size(0); ++y) { + for (index_t x = 0; x < xlen; x += sse2::FVec::kSize) { + sse2::Saver::Save(&dst[y][x], plan.EvalSSE(y, x)); + } + for (index_t x = xlen; x < dst.size(1); ++x) { + SV::Save(dst[y][x], plan.Eval(y, x)); + } + } +} +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_USE_SSE +#endif // MSHADOW_SSE_INL_H_ diff --git a/mshadow/tensor.h b/mshadow/tensor.h index 80c5b32d3f05..b13307c7b9da 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -120,7 +120,6 @@ v * \return subshape */ MSHADOW_XINLINE Shape SubShape(void) const { Shape s; - s.stride_ = this->stride_; // for cuda #pragma unroll for (int i = 0; i < kSubdim; ++i) { @@ -319,7 +318,7 @@ struct Tensor: public TRValue, */ template struct Tensor: - public expr::RValueExp, DType> { + public TRValue, Device, 1, DType> { public: DType *dptr_; Shape<1> shape_; @@ -330,6 +329,8 @@ struct Tensor: MSHADOW_XINLINE Tensor(const Shape<1> &shape): shape_(shape) {} MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape) : dptr_(dptr), shape_(shape), stride_(shape[0]), stream_(NULL) {} + MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape, index_t stride) + : dptr_(dptr), shape_(shape), stride_(stride), stream_(NULL) {} MSHADOW_XINLINE Tensor FlatTo2D(void) const { return Tensor(dptr_, shape_.FlatTo2D(), stride_); } @@ -471,12 +472,12 @@ inline void Softmax(Tensor dst, const Tensor &energy); */ template -inline void MapExp(TRValue dst, +inline void MapExp(TRValue *dst, const expr::Exp &exp); /*! \brief refer to comment of cpu ver \sa MapExp */ template -inline void MapExp(TRValue dst, +inline void MapExp(TRValue *dst, const expr::Exp &exp); /*! 
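*/

// Usage sketch of the pointer-taking MapExp declared here (CPU case): this
// is the call that dst = src + src lowers to through ExpEngine.
#include "mshadow/tensor.h"

void MapExpDemo(void) {
  using namespace mshadow;
  Tensor<cpu, 2, float> dst = NewTensor<cpu, float>(Shape2(2, 3), 0.0f);
  Tensor<cpu, 2, float> src = NewTensor<cpu, float>(Shape2(2, 3), 1.0f);
  MapExp<sv::saveto>(&dst, src + src);  // every dst element becomes 2
  FreeSpace(&dst); FreeSpace(&src);
}

/*!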
* \brief CPU/GPU: map a expression, do reduction to 1D Tensor in lowest dimension (dimension 0) @@ -493,13 +494,13 @@ inline void MapExp(TRValue dst, */ template -inline void MapReduceKeepLowest(TRValue dst, +inline void MapReduceKeepLowest(TRValue *dst, const expr::Exp &exp, DType scale = 1); /*! \brief refer to comment of cpu ver \sa MapReduceKeepLowest */ template -inline void MapReduceKeepLowest(TRValue dst, +inline void MapReduceKeepLowest(TRValue *dst, const expr::Exp &exp, DType scale = 1); /*! @@ -518,16 +519,17 @@ inline void MapReduceKeepLowest(TRValue dst, */ template -inline void MapReduceKeepHighDim(TRValue dst, +inline void MapReduceKeepHighDim(TRValue *dst, const expr::Exp &exp, DType scale = 1); /*! \brief refer to comment of cpu ver \sa MapReduceKeepHighDim */ template -inline void MapReduceKeepHighDim(TRValue dst, +inline void MapReduceKeepHighDim(TRValue *dst, const expr::Exp &exp, DType scale = 1); } // namespace mshadow // include headers #include "./expr_engine-inl.h" +#include "./tensor_cpu-inl.h" #endif // MSHADOW_TENSOR_H_ diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h new file mode 100644 index 000000000000..3ea32d038b72 --- /dev/null +++ b/mshadow/tensor_cpu-inl.h @@ -0,0 +1,206 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file tensor_cpu-inl.h + * \brief implementation of CPU host code + * \author Bing Xu, Tianqi Chen + */ +#ifndef MSHADOW_TENSOR_CPU_INL_H_ +#define MSHADOW_TENSOR_CPU_INL_H_ + +#include +#include "./base.h" +#include "./sse-inl.h" + +namespace mshadow { +template +inline void AllocSpace(Tensor *obj, bool pad) { + size_t pitch; + void *dptr; + if (pad) { + dptr = sse2::AlignedMallocPitch + (&pitch, obj->size(dim - 1) * sizeof(DType), obj->shape_.FlatTo2D()[0]); + obj->stride_ = static_cast(pitch / sizeof(DType)); + } else { + obj->stride_ = obj->size(dim - 1); + dptr = sse2::AlignedMallocPitch + (&pitch, obj->shape_.Size() * sizeof(DType), 1); + } + obj->dptr_ = reinterpret_cast(dptr); +} +template +inline Tensor +NewTensor(const Shape &shape, DType initv, bool pad) { + Tensor obj(shape); + AllocSpace(&obj, pad); + MapExp(&obj, expr::ScalarExp(initv)); + return obj; +} +template +inline void FreeSpace(Tensor *obj) { + sse2::AlignedFree(obj->dptr_); + obj->dptr_ = NULL; +} +template +inline void Copy(Tensor _dst, + const Tensor &_src) { + utils::Check(_dst.shape == _src.shape, "Copy:shape mismatch"); + Tensor dst = _dst.FlatTo2D(); + Tensor src = _src.FlatTo2D(); + for (index_t y = 0; y < dst.size(0); ++y) { + memcpy(dst[y].dptr_, src[y].dptr_, sizeof(DType) * dst.size(1)); + } +} +template +inline void MapPlan(TRValue *dst, + const expr::Plan &plan) { + Shape<2> shape = dst->self().shape_.FlatTo2D(); + expr::Plan dplan = expr::MakePlan(dst->self()); + for (index_t y = 0; y < shape[0]; ++y) { + for (index_t x = 0; x < shape[1]; ++x) { + // trust your compiler! 
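
// Worked shape example for the MapReduceKeepHighDim declared above, assuming
// a 4D source of shape (2, 3, 4, 5) and dimkeep = 1 (a "channel" axis): the
// destination must be a 1D tensor of length 3, and for each channel the
// kernel reduces over the batch axis (2) and the trailing 4 x 5 plane.
#include "mshadow/tensor.h"

void ChannelSumDemo(const mshadow::Tensor<mshadow::cpu, 4, float> &src) {
  using namespace mshadow;
  Tensor<cpu, 1, float> csum = NewTensor<cpu, float>(Shape1(3), 0.0f);
  MapReduceKeepHighDim<sv::saveto, red::sum, 1>(&csum, src);
  FreeSpace(&csum);
}
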
-_- they will optimize it + Saver::Save(dplan.Eval(y, x), plan.Eval(y, x)); + } + } +} +// code to handle SSE optimization +template +struct MapExpCPUEngine { + inline static void Map(TRValue *dst, + const expr::Exp &exp) { + MapPlan(dst, MakePlan(exp.self())); + } +}; + +#if MSHADOW_USE_SSE +template +struct MapExpCPUEngine, + dim, DType, E, etype> { + inline static void Map(Tensor *dst, + const expr::Exp &exp) { + if (expr::SSEAlignCheck::Check(exp.self()) && + expr::SSEAlignCheck >::Check(*dst)) { + expr::MapSSEPlan(dst->self(), MakeSSEPlan(exp.self())); + } else { + MapPlan(dst, MakePlan(exp.self())); + } + } +}; +#endif + +template +inline void MapExp(TRValue *dst, + const expr::Exp &exp) { + expr::TypeCheckPass::kMapPass> + ::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); + Shape eshape = expr::ShapeCheck::Check(exp.self()); + utils::Check(eshape[0] == 0 || eshape == dst->self().shape_, + "Assignment: Shape of Tensors are not consistent with target"); +#if MSHADOW_USE_SSE + MapExpCPUEngine::kPass, Saver, R, dim, DType, E, etype> + ::Map(dst->ptrself(), exp); +#else + MapExpCPUEngine::Map(dst, exp); +#endif +} + +template +inline void MapReduceKeepLowest(TRValue *dst, + const expr::Exp &exp, + DType scale) { + expr::TypeCheckPass::kRedPass> + ::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + Shape<2> eshape = expr::ShapeCheck::kDim, E> + ::Check(exp.self()).FlatTo2D(); + utils::Check(eshape[0] == dst->self().size(0), + "reduction dimension do not match"); + utils::Check(eshape[1] != 0, "can not reduce over empty tensor"); + // execution + expr::Plan dplan = MakePlan(dst->self()); + expr::Plan splan = MakePlan(exp.self()); + for (index_t x = 0; x < eshape[1]; ++x) { + DType res = splan.Eval(0, x); + for (index_t y = 1; y < eshape[0]; ++y) { + Reducer::Reduce(res, splan.Eval(y, x)); + } + Saver::Save(dplan.Eval(0, x), res * scale); + } +} + +template +inline void MapReduceKeepHighDim(TRValue *dst, + const expr::Exp &exp, + DType scale) { + expr::TypeCheckPass::kRedPass> + ::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + typedef Shape::kDim> EShape; + EShape eshape = expr::ShapeCheck::kDim, E> + ::Check(exp.self()); + utils::Check(eshape[dimkeep] == dst->self().size(0), + "reduction dimension do not match"); + // use equvalent form + Shape<4> pshape = Shape4(eshape.ProdShape(0, dimkeep), + eshape[dimkeep], + eshape.ProdShape(dimkeep, EShape::kSubdim), + eshape[EShape::kSubdim]); + // execution + expr::Plan dplan = MakePlan(dst->self()); + expr::Plan splan = MakePlan(exp.self()); + for (index_t c = 0; c < pshape[1]; ++c) { + DType res = Reducer::kInitV; + for (index_t n = 0; n < pshape[0]; ++n) { + DType tres = Reducer::kInitV; + for (index_t y = 0; y < pshape[2]; ++y) { + for (index_t x = 0; x < pshape[3]; ++x) { + Reducer::Reduce(tres, + splan.Eval((n * pshape[1] + c) * pshape[2] + y, x)); + } + } + Reducer::Reduce(res, tres); + } + Saver::Save(dplan.Eval(0, c), res * scale); + } +} + +template +inline void Softmax(Tensor dst, + const Tensor &energy) { + DType mmax = energy[0]; + for (index_t x = 1; x < dst.size(0); ++x) { + if (mmax < energy[x]) mmax = energy[x]; + } + DType sum = 0.0f; + for (index_t x = 0; x < dst.size(0); ++x) { + dst[x] = std::exp(energy[x] - mmax); + sum += dst[x]; + } + for (index_t x = 0; x < dst.size(0); ++x) { + dst[x] /= sum; + } +} +template +inline void Softmax(Tensor dst, + const Tensor &energy) { + utils::Check(dst.shape == energy.shape, "Softmax: shape mismatch"); + for (index_t y = 0; y < dst.size(0); ++y) { + Softmax(dst[y], energy[y]); + } +} + +template 
+inline DType VDot(const Tensor &lhs, + const Tensor &rhs) { + utils::Check(lhs.shape == rhs.shape, "VDot: shape mismatch"); + DType sum = static_cast(0); + for (index_t x = 0; x < lhs.size(0); ++x) { + sum += lhs[x] * rhs[x]; + } + return sum; +} +} // namespace mshadow +#endif // MSHADOW_TENSOR_CPU_INL_H_ diff --git a/mshadow/tensor_cpu-inl.hpp b/mshadow/tensor_cpu-inl.hpp deleted file mode 100644 index 4015afae8709..000000000000 --- a/mshadow/tensor_cpu-inl.hpp +++ /dev/null @@ -1,177 +0,0 @@ -#ifndef MSHADOW_TENSOR_CPU_INL_HPP -#define MSHADOW_TENSOR_CPU_INL_HPP -/*! - * \file tensor_cpu-inl.hpp - * \brief implementation of CPU host code - * \author Bing Xu, Tianqi Chen - */ -#include -#include "tensor_base.h" -#include "tensor_sse-inl.hpp" - -namespace mshadow { - template - inline void AllocSpace(Tensor &obj, bool pad ){ - size_t pitch; - if( pad ){ - obj.dptr = (real_t*)sse2::AlignedMallocPitch - ( pitch, obj.shape[0] * sizeof(real_t), obj.FlatTo2D().shape[1] ); - obj.shape.stride_ = static_cast( pitch / sizeof(real_t) ); - }else{ - obj.shape.stride_ = obj.shape[0]; - obj.dptr = (real_t*)sse2::AlignedMallocPitch - ( pitch, obj.shape.Size() * sizeof(real_t), 1 ); - } - } - - template - inline Tensor NewTensor(const Shape &shape, real_t initv, bool pad ){ - Tensor obj( shape ); - AllocSpace( obj, pad ); - MapExp( obj, expr::ScalarExp( initv ) ); - return obj; - } - - template - inline void FreeSpace(Tensor &obj){ - sse2::AlignedFree( obj.dptr ); - obj.dptr = NULL; - } - - template - inline void Copy(Tensor _dst, const Tensor &_src ){ - utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" ); - Tensor dst = _dst.FlatTo2D(); - Tensor src = _src.FlatTo2D(); - for (index_t y = 0; y < dst.shape[1]; ++y ) { - memcpy( dst[y].dptr, src[y].dptr, sizeof(real_t) * dst.shape[0] ); - } - } - - template - inline void MapPlan(Tensor _dst, const expr::Plan &plan){ - Tensor dst = _dst.FlatTo2D(); - for (index_t y = 0; y < dst.shape[1]; ++y ) { - for (index_t x = 0; x < dst.shape[0]; ++x ) { - // trust your compiler! 
-_- they will optimize it - Saver::Save(dst[y][x], plan.Eval( y, x ) ); - } - } - } - - // code to handle SSE optimization - template - struct MapExpCPUEngine; - template - struct MapExpCPUEngine{ - inline static void Map(Tensor dst, const expr::Exp &exp ){ - MapPlan( dst, MakePlan( exp.self() ) ); - } - }; - - #if MSHADOW_USE_SSE - template - struct MapExpCPUEngine{ - inline static void Map(Tensor dst, const expr::Exp &exp ){ - using namespace expr; - if( SSEAlignCheck::Check( exp.self() ) && SSEAlignCheck< dim,Tensor >::Check(dst) ){ - MapSSEPlan( dst, MakeSSEPlan( exp.self() ) ); - }else{ - MapPlan( dst, MakePlan( exp.self() ) ); - } - } - }; - #endif - - template - inline void MapExp(Tensor dst, const expr::Exp &exp ){ - using namespace expr; - TypeCheckPass< TypeCheck::kMapPass >::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); - Shape eshape = ShapeCheck::Check( exp.self() ); - utils::Assert( eshape[0] == 0 || eshape == dst.shape, "Assignment: Shape of Tensors in expression is not consistent with target" ); - #if MSHADOW_USE_SSE - MapExpCPUEngine< SSECheck::kPass,Saver,dim,E,etype >::Map( dst, exp ); - #else - MapExpCPUEngine< false,Saver,dim,E,etype >::Map( dst, exp ); - #endif - } - - template - inline void MapReduceKeepLowest( Tensor dst, const expr::Exp &exp, real_t scale ){ - using namespace expr; - TypeCheckPass< TypeCheck::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); - Shape<2> eshape = ShapeCheck< ExpInfo::kDim, E >::Check( exp.self() ).FlatTo2D(); - - utils::Assert( eshape[0] == dst.shape[0], "reduction dimension do not match" ); - utils::Assert( eshape[1] != 0, "can not reduce over empty tensor" ); - // execution - expr::Plan plan = MakePlan( exp.self() ); - for( index_t x = 0; x < eshape[0]; ++x ){ - real_t res = plan.Eval( 0, x ); - for( index_t y = 1; y < eshape[1]; ++y ){ - Reducer::Reduce( res, plan.Eval( y, x ) ); - } - Saver::Save( dst[x], res*scale ); - } - } - - template - inline void MapReduceKeepHighDim( Tensor dst, const expr::Exp &exp, real_t scale ){ - using namespace expr; - TypeCheckPass< TypeCheck::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); - typedef Shape< ExpInfo::kDim > EShape; - EShape eshape = ShapeCheck< ExpInfo::kDim, E >::Check( exp.self() ); - utils::Assert( eshape[dimkeep] == dst.shape[0], "reduction dimension do not match" ); - // use equvalent form - Shape<4> pshape = Shape4( eshape.ProdShape(dimkeep+1,EShape::kMaxShape), eshape[dimkeep], - eshape.ProdShape(1,dimkeep), eshape[0] ); - - // execution - expr::Plan plan = MakePlan( exp.self() ); - - for( index_t c = 0; c < pshape[2]; ++c ){ - real_t res = Reducer::kInitV; - for( index_t n = 0; n < pshape[3]; ++n ){ - real_t tres = Reducer::kInitV; - for( index_t y = 0; y < pshape[1]; ++y ){ - for( index_t x = 0; x < pshape[0]; ++x ){ - Reducer::Reduce( tres, plan.Eval( (n*pshape[2] + c) * pshape[1] + y, x ) ); - } - } - Reducer::Reduce( res, tres ); - } - Saver::Save( dst[c], res*scale ); - } - } - - inline void Softmax( Tensor dst, const Tensor& energy ){ - real_t mmax = energy[0]; - for( real_t x = 1; x < dst.shape[0]; ++x ) - if( mmax < energy[x] ) mmax = energy[x]; - real_t sum = 0.0f; - for( index_t x = 0; x < dst.shape[0]; ++x ){ - dst[x] = std::exp( energy[x] - mmax ); - sum += dst[x]; - } - for( index_t x = 0; x < dst.shape[0]; ++x ){ - dst[x] /= sum; - } - } - inline void Softmax( Tensor dst, const Tensor& energy ){ - utils::Assert( dst.shape == energy.shape, "Softmax: shape mismatch" ); - for( index_t y = 0; y < dst.shape[1]; ++y ){ - Softmax( dst[y], energy[y] ); - } 
- } - - inline real_t VDot( const Tensor& lhs, const Tensor& rhs ){ - utils::Assert( lhs.shape == rhs.shape, "VDot: shape mismatch" ); - real_t sum = 0.0f; - for( index_t x = 0; x < lhs.shape[0]; ++x ){ - sum += lhs[x] * rhs[x]; - } - return sum; - } -}; // namespace mshadow - -#endif // TENSOR_CPU_INL_HPP diff --git a/mshadow/tensor_sse-inl.hpp b/mshadow/tensor_sse-inl.hpp deleted file mode 100644 index b98383e83d6a..000000000000 --- a/mshadow/tensor_sse-inl.hpp +++ /dev/null @@ -1,431 +0,0 @@ -#ifndef MSHADOW_TENSOR_SSE_INL_HPP -#define MSHADOW_TENSOR_SSE_INL_HPP -/*! - * \file tensor_sse-inl.hpp - * \brief support of sse2 optimization of some operations - * \author Tianqi Chen - */ -#ifdef __APPLE__ -#include -#else -#include -#endif - -#include "tensor_expr.h" -#include "tensor.h" - -namespace mshadow { - /*! \brief namespace to support sse2 vectorization */ - namespace sse2{ - /*! - * \brief analog to cudaMallocPitch, allocate a aligned space with num_line * lspace cells - * \param pitch output parameter, the actuall space allocated for each line - * \param lspace number of cells required for each line - * \param num_line number of lines to be allocated - */ - inline void* AlignedMallocPitch( size_t &pitch, size_t lspace, size_t num_line ){ - pitch = ((lspace+15) >> 4) << 4; - #ifdef _MSC_VER - void * res = _aligned_malloc( pitch*num_line, 16 ); - #else - #ifdef __APPLE__ - void *res = malloc( pitch * num_line ); - #else - void * res = memalign( 16, pitch*num_line ); - #endif - #endif - utils::Assert( res != NULL, "AlignedMallocPitch failed" ); - return res; - } - /*! - * \brief free aligned space - * \param ptr pointer to space to be freed - */ - inline void AlignedFree( void *ptr ){ - #ifdef _MSC_VER - _aligned_free( ptr ); - #else - free( ptr ); - #endif - } - /*! \brief check if a pointer is aligned */ - inline bool CheckAlign( size_t pitch ){ - return !(pitch & ((1<<4)-1)); - } - /*! \brief check if a pointer is aligned */ - inline bool CheckAlign( void *ptr ){ - return CheckAlign( (size_t)ptr ); - } - /*! - * \brief get upper bound of aligned index of size - * \param size size of the array - * \param fsize size of float - */ - inline index_t UpperAlign( index_t size, size_t fsize ){ - return (( (size*fsize+15) >> 4 ) << 4) / fsize; - } - /*! - * \brief get lower bound of aligned index of size - * \param size size of the array - * \param fsize size of float - */ - inline index_t LowerAlign( index_t size, size_t fsize ){ - return (( (size*fsize) >> 4 ) << 4) / fsize; - } - }; // namespace sse2 -}; // namespace mshadow - -#if MSHADOW_USE_SSE -// sse types are not compatible with nvcc, only use them in cpu mode -#include - -namespace mshadow{ - namespace sse2{ - /*! - * \brief float vector real type, used for vectorization - * \tparam FloatType double or float - */ - template struct FVec{}; - - /*! \brief vector real type for float */ - template<> - struct FVec { - public: - typedef __m128 DType; - /*! \brief number of float in vector */ - const static index_t kSize = 4; - /*! \brief data content */ - DType data_; - public: - /* constructors */ - FVec( void ){} - FVec( DType data ):data_(data){} - /* set the float */ - FVec( const float &s ){ - data_ = _mm_set1_ps( s ); - } - /*!\brief load from pointer src */ - FVec( const float *src ){ - data_ = _mm_load_ps( src ); - } - public: - /*! \brief store data into dst space */ - inline void Store( float *dst ) const{ - return _mm_store_ps( dst, data_ ); - } - /*! 
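
AlignedMallocPitch rounds every line up to a 16-byte boundary so each row can be loaded with aligned SSE instructions; a small worked example:

    size_t pitch;
    // 5 floats per line = 20 bytes, rounded up to the next multiple of 16: pitch = 32
    void *buf = mshadow::sse2::AlignedMallocPitch(pitch, 5 * sizeof(float), 3);
    // the 3 lines start at buf, buf + pitch, buf + 2 * pitch, all 16-byte aligned
    mshadow::sse2::AlignedFree(buf);
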
\brief sum of all content */ - inline float Sum( void ) const{ - DType ans = _mm_add_ps( data_, _mm_movehl_ps( data_, data_ ) ); - DType rst = _mm_add_ss( ans, _mm_shuffle_ps( ans, ans, 1 ) ); - #if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64) - return rst.m128_f32[ 0 ]; - #else - float rr = _mm_cvtss_f32( rst ) ; - return rr; - #endif - } - }; - - /*! \brief vector real type for float */ - template<> - struct FVec { - public: - typedef __m128d DType; - /*! \brief number of float in vector */ - const static index_t kSize = 2; - /*! \brief data content */ - DType data_; - public: - /* constructors */ - FVec( void ){} - FVec( DType data ):data_(data){} - /* set the float */ - FVec( const double &s ){ - data_ = _mm_set1_pd( s ); - } - /*!\brief load from pointer src */ - FVec( const double *src ){ - data_ = _mm_load_pd( src ); - } - public: - /*! \brief store data into dst space */ - inline void Store( double *dst ) const{ - return _mm_store_pd( dst, data_ ); - } - /*! \brief sum of all content */ - inline double Sum( void ) const{ - DType tmp = _mm_add_sd( data_, _mm_unpackhi_pd( data_,data_ ) ) ; - #if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64) - return tmp.m128d_f64[0]; - #else - double ans = _mm_cvtsd_f64( tmp ); - return ans; - #endif - } - }; - }; - - namespace sse2{ - /*! \brief sse2 operator type of certain operator */ - template - struct SSEOp{ - const static bool kEnabled = false; - }; - template<> - struct SSEOp{ - const static bool kEnabled = true; - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_add_ps( lhs.data_, rhs.data_ ) ); - } - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_add_pd( lhs.data_, rhs.data_ ) ); - } - }; - template<> - struct SSEOp{ - const static bool kEnabled = true; - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_sub_ps( lhs.data_, rhs.data_ ) ); - } - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_sub_pd( lhs.data_, rhs.data_ ) ); - } - }; - template<> - struct SSEOp{ - const static bool kEnabled = true; - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_mul_ps( lhs.data_, rhs.data_ ) ); - } - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_mul_pd( lhs.data_, rhs.data_ ) ); - } - }; - template<> - struct SSEOp{ - const static bool kEnabled = true; - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_div_ps( lhs.data_, rhs.data_ ) ); - } - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_div_pd( lhs.data_, rhs.data_ ) ); - } - }; - - template<> - struct SSEOp{ - const static bool kEnabled = true; - MSHADOW_CINLINE static FVec Map( const FVec &src ){ - return src; - } - MSHADOW_CINLINE static FVec Map( const FVec &src ){ - return src; - } - }; - }; // namespace sse2 - - namespace sse2{ - // savers to do storage - template - struct Saver{ - MSHADOW_CINLINE static void Save( TFloat *dst, const FVec &src ){ - FVec lhs( dst ); - FVec ans = SSEOp::Map( lhs, src ); - ans.Store( dst ); - } - }; - template - struct Saver{ - MSHADOW_CINLINE static void Save( TFloat *dst, const FVec &src ){ - src.Store( dst ); - } - }; - }; // namespace sse2 -}; // namespace mshadow - -namespace mshadow{ - namespace expr{ - // same as plan, but use sse2 - template - class SSEPlan { - public: - /*! 
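
Each SSEOp specialization pairs a scalar operator with its packed intrinsic, so one expression template drives both paths; lane 0 of the vector result equals the scalar result (a sketch, assuming pa and pb point to 16-byte-aligned float arrays):

    sse2::FVec<float> va(pa), vb(pb);                        // load 4 floats each
    sse2::FVec<float> vsum = sse2::SSEOp<op::plus>::Map(va, vb);
    float s0 = op::plus::Map(pa[0], pb[0]);                  // == lane 0 of vsum
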
- * \brief evaluate the expression at index [y][x], x will be aligned to 4 - * to be implemented by SubType - */ - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const; - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const; - }; - - template - class SSEPlan< Tensor >{ - public: - SSEPlan( const Tensor &t ) - :dptr_(t.dptr),stride_(t.shape.stride_){} - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const{ - return sse2::FVec( &dptr_[ y*stride_+x ] ); - } - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ y * stride_ + x ]; - } - private: - const real_t *dptr_; - index_t stride_; - }; - - template<> - class SSEPlan{ - public: - SSEPlan( real_t scalar ):scalar_(scalar){} - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const{ - return sse2::FVec( scalar_ ); - } - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ - return scalar_; - } - private: - real_t scalar_; - }; - - template - class SSEPlan< BinaryMapExp >{ - public: - SSEPlan( const SSEPlan &lhs, const SSEPlan &rhs ) - :lhs_(lhs), rhs_(rhs){} - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const{ - return sse2::SSEOp::Map( lhs_.EvalSSE( y, x ), rhs_.EvalSSE( y, x ) ); - } - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ - return OP::Map( lhs_.Eval( y, x ), rhs_.Eval( y, x ) ); - } - private: - SSEPlan lhs_; - SSEPlan rhs_; - }; - - template - class SSEPlan< UnaryMapExp >{ - public: - SSEPlan( const SSEPlan &src ):src_(src){} - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const{ - return sse2::SSEOp::Map( src_.EvalSSE( y, x ) ); - } - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ - return OP::Map( src_.Eval( y, x ) ); - } - private: - SSEPlan src_; - }; - - template - inline SSEPlan< BinaryMapExp > MakeSSEPlan( const BinaryMapExp &e ); - - inline SSEPlan MakeSSEPlan( const ScalarExp &e ){ - return SSEPlan( e.scalar_ ); - } - - template - inline SSEPlan MakeSSEPlan( const ContainerExp &e ){ - return SSEPlan( e.self() ); - } - - template - inline SSEPlan MakeSSEPlan( const MakeTensorExp &e ){ - return SSEPlan( e.real_self() ); - } - - template - inline SSEPlan< UnaryMapExp > MakeSSEPlan( const UnaryMapExp &e ){ - return SSEPlan< UnaryMapExp >( MakeSSEPlan(e.src_) ); - } - - template - inline SSEPlan< BinaryMapExp > MakeSSEPlan( const BinaryMapExp &e ){ - return SSEPlan< BinaryMapExp >( MakeSSEPlan(e.lhs_), MakeSSEPlan(e.rhs_) ); - } - }; - - namespace expr{ - /*! 
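
MakeSSEPlan mirrors the scalar MakePlan: it recurses over the expression tree and yields a plan whose EvalSSE returns four packed results per call, while Eval stays available as the scalar fallback for unaligned tails. A sketch on a plain tensor A (assuming its dptr and stride are 16-byte aligned):

    expr::SSEPlan<Tensor<cpu, 2> > plan = expr::MakeSSEPlan(A);
    sse2::FVec<real_t> v = plan.EvalSSE(0, 0);   // packed A[0][0..3]
    real_t s = plan.Eval(0, 1);                  // scalar path over the same data
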
- * \brief static check sse enable - * if a expression E can not be evaluated using sse, then kPass = false - * \tparam Device the type of Device - * \tparam dim dimension of the tensor - * \tparam E expression - */ - template - struct SSECheck{ - const static bool kPass = false; - }; - template<> - struct SSECheck{ - const static bool kPass = true; - }; - template - struct SSECheck >{ - const static bool kPass = true; - }; - - template - struct SSECheck >{ - const static bool kPass = SSECheck::kPass && sse2::SSEOp::kEnabled; - }; - template - struct SSECheck< BinaryMapExp >{ - const static bool kPass = SSECheck::kPass && SSECheck::kPass && sse2::SSEOp::kEnabled; - }; - }; // namespace expr - namespace expr{ - // check if data is aligned and allow sse operation - template - struct SSEAlignCheck{ - inline static bool Check( const E &exp ){ - return false; - } - }; - template - struct SSEAlignCheck< dim, ScalarExp >{ - inline static bool Check( const ScalarExp &exp ){ - return true; - } - }; - template - struct SSEAlignCheck< dim,Tensor >{ - inline static bool Check( const Tensor &t ){ - return sse2::CheckAlign( t.dptr ) && sse2::CheckAlign( t.shape.stride_ * sizeof( real_t ) ); - } - }; - template - struct SSEAlignCheck< dim, UnaryMapExp >{ - inline static bool Check( const UnaryMapExp &t ){ - return SSEAlignCheck::Check( t.src_); - } - }; - template - struct SSEAlignCheck< dim, BinaryMapExp >{ - inline static bool Check( const BinaryMapExp &t ){ - return SSEAlignCheck::Check( t.lhs_ ) && - SSEAlignCheck::Check( t.rhs_ ); - } - }; - }; // namespace expr - - /*! - * \brief use SSEPlan to compute result - */ - template - inline void MapSSEPlan(Tensor _dst, const expr::SSEPlan &plan){ - Tensor dst = _dst.FlatTo2D(); - const index_t xlen = sse2::LowerAlign( dst.shape[0], sizeof(real_t) ); - for ( index_t y = 0; y < dst.shape[1]; y ++ ) { - for( index_t x = 0; x < xlen; x += sse2::FVec::kSize ){ - sse2::Saver::Save( &dst[y][x], plan.EvalSSE( y,x ) ); - } - for( index_t x = xlen; x < dst.shape[0]; x ++ ){ - SV::Save( dst[y][x], plan.Eval(y,x) ); - } - } - } -}; // namespace mshadow -#endif // MSHADOW_USE_SSE -#endif // MSHADOW_TENSOR_SSE_INL_HPP From ab05d68b5ffaa14752ec60c6b85e38b7bedd8e40 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 25 Dec 2014 04:41:22 -0800 Subject: [PATCH 019/147] refactor io module --- mshadow/io.h | 120 ++++++++++++++++++++++++++++++++++++++ mshadow/tensor.h | 1 + mshadow/tensor_io.h | 137 -------------------------------------------- 3 files changed, 121 insertions(+), 137 deletions(-) create mode 100644 mshadow/io.h delete mode 100644 mshadow/tensor_io.h diff --git a/mshadow/io.h b/mshadow/io.h new file mode 100644 index 000000000000..8034609bc1dd --- /dev/null +++ b/mshadow/io.h @@ -0,0 +1,120 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file tensor_io.h + * \brief definitions of I/O functions for mshadow tensor + * \author Tianqi Chen + */ +#ifndef MSHADOW_IO_H_ +#define MSHADOW_IO_H_ +#include "./tensor.h" + +namespace mshadow { +namespace utils { +/*! + * \brief interface of stream I/O, used to serialize data, + * mshadow does not restricted to only this interface in SaveBinary/LoadBinary + * mshadow accept all class that implements Read and Write + */ +class IStream { + public: + /*! + * \brief read data from stream + * \param ptr pointer to memory buffer + * \param size size of block + * \return usually is the size of data readed + */ + virtual size_t Read(void *ptr, size_t size) = 0; + /*! 
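
Any class with this Read/Write pair can be passed where a TStream is expected; a minimal FILE*-backed stream (essentially the FileStream the old tensor_io.h shipped, sketched here under a hypothetical name; needs <cstdio>) plus a save call:

    class StdFileStream : public utils::IStream {
     public:
      explicit StdFileStream(FILE *fp) : fp_(fp) {}
      virtual size_t Read(void *ptr, size_t size) { return fread(ptr, size, 1, fp_); }
      virtual void Write(const void *ptr, size_t size) { fwrite(ptr, size, 1, fp_); }
     private:
      FILE *fp_;
    };
    // usage: StdFileStream fo(fopen("weight.bin", "wb")); SaveBinary(fo, weight);
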
+   * \brief write data to stream
+   * \param ptr pointer to memory buffer
+   * \param size size of block
+   */
+  virtual void Write(const void *ptr, size_t size) = 0;
+  /*! \brief virtual destructor */
+  virtual ~IStream(void) {}
+};
+}  // namespace utils
+/*!
+ * \brief CPU/GPU: save a tensor in binary format, for the GPU version a temp Tensor<cpu> storage will be allocated
+ * \param fo output binary stream
+ * \param src source tensor
+ * \tparam dim dimension of tensor
+ * \tparam DType type of element in tensor
+ * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream.
+ */
+template<int dim, typename DType, typename TStream>
+inline void SaveBinary(TStream &fo, const Tensor<cpu, dim, DType> &src);
+/*! \brief refer to comment of cpu ver \sa SaveBinary */
+template<int dim, typename DType, typename TStream>
+inline void SaveBinary(TStream &fo, const Tensor<gpu, dim, DType> &src);
+/*!
+ * \brief CPU/GPU: load a tensor in binary format, for the GPU version a temp Tensor<cpu> storage will be allocated
+ * if pre_alloc is true, then space in dst is preallocated, and must have the same shape as the tensor loaded
+ * if pre_alloc is false, then dst originally does not have space allocated, LoadBinary will allocate space for dst
+ * \param fi input binary stream
+ * \param dst destination tensor
+ * \param pre_alloc whether space is pre-allocated, if false, space allocation will happen
+ * \tparam dim dimension of tensor
+ * \tparam DType type of element in tensor
+ * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream.
+ */
+template<int dim, typename DType, typename TStream>
+inline void LoadBinary(TStream &fi,
+                       Tensor<cpu, dim, DType> *dst, bool pre_alloc);
+/*! \brief refer to comment of cpu ver \sa LoadBinary */
+template<int dim, typename DType, typename TStream>
+inline void LoadBinary(TStream &fi,
+                       Tensor<gpu, dim, DType> *dst, bool pre_alloc);
+
+// implementations
+template<int dim, typename DType, typename TStream>
+inline void SaveBinary(TStream &fo, const Tensor<cpu, dim, DType> &src_) {
+  fo.Write(&src_.shape_, sizeof(src_.shape_));
+  Tensor<cpu, 2, DType> src = src_.FlatTo2D();
+  for (index_t i = 0; i < src.size(0); ++i) {
+    fo.Write(src[i].dptr_, sizeof(DType) * src.size(1));
+  }
+}
+template<int dim, typename DType, typename TStream>
+inline void SaveBinary(TStream &fo, const Tensor<gpu, dim, DType> &src) {
+  // copy to CPU, then save
+  Tensor<cpu, dim, DType> tmp(src.shape_);
+  AllocSpace(&tmp);
+  Copy(tmp, src);
+  SaveBinary(fo, tmp);
+  FreeSpace(&tmp);
+}
+template<int dim, typename DType, typename TStream>
+inline void LoadBinary(TStream &fi,
+                       Tensor<cpu, dim, DType> *dst_, bool pre_alloc) {
+  Shape<dim> shape;
+  utils::Check(fi.Read(&shape, sizeof(shape)) != 0, "mshadow::LoadBinary");
+  if (pre_alloc) {
+    utils::Check(shape == dst_->shape_,
+                 "LoadBinary, shape do not match pre-allocated shape");
+  } else {
+    dst_->shape_ = shape; AllocSpace(dst_);
+  }
+  Tensor<cpu, 2, DType> dst = dst_->FlatTo2D();
+  if (dst.size(1) == 0) return;
+  for (index_t i = 0; i < dst.size(0); ++i) {
+    utils::Check(fi.Read(dst[i].dptr_, sizeof(DType) * dst.size(1)) != 0,
+                 "mshadow::LoadBinary");
+  }
+}
+template<int dim, typename DType, typename TStream>
+inline void LoadBinary(TStream &fi,
+                       Tensor<gpu, dim, DType> *dst, bool pre_alloc) {
+  Tensor<cpu, dim, DType> tmp;
+  LoadBinary(fi, &tmp, false);
+  if (pre_alloc) {
+    utils::Check(tmp.shape_ == dst->shape_,
+                 "LoadBinary, shape do not match pre-allocated shape");
+  } else {
+    dst->shape_ = tmp.shape_; AllocSpace(dst);
+  }
+  Copy(*dst, tmp);
+  FreeSpace(&tmp);
+}
+}  // namespace mshadow
+#endif  // MSHADOW_IO_H_
diff --git a/mshadow/tensor.h b/mshadow/tensor.h
index b13307c7b9da..13f6a57f2aee 100644
--- a/mshadow/tensor.h
+++ b/mshadow/tensor.h
@@ -532,4 +532,5 @@ inline void MapReduceKeepHighDim(TRValue<R, cpu, dimkeep, DType> *dst,
 // include headers
 #include "./expr_engine-inl.h"
 #include "./tensor_cpu-inl.h"
+#include "./io.h"
 #endif  // MSHADOW_TENSOR_H_
diff --git a/mshadow/tensor_io.h b/mshadow/tensor_io.h
deleted file mode 100644
index 
2ce28b3a75e6..000000000000 --- a/mshadow/tensor_io.h +++ /dev/null @@ -1,137 +0,0 @@ -#ifndef MSHADOW_TENSOR_IO_H -#define MSHADOW_TENSOR_IO_H -/*! - * \file tensor_io.h - * \brief definitions of I/O functions for mshadow tensor - * \author Tianqi Chen - */ -#include -#include "tensor.h" - -namespace mshadow{ - namespace utils{ - /*! - * \brief interface of stream I/O, used to serialize data, - * it is not restricted to only this interface in SaveBinary/LoadBinary - * mshadow accept all class that implements Read and Write - */ - class IStream{ - public: - /*! - * \brief read data from stream - * \param ptr pointer to memory buffer - * \param size size of block - * \return usually is the size of data readed - */ - virtual size_t Read( void *ptr, size_t size ) = 0; - /*! - * \brief write data to stream - * \param ptr pointer to memory buffer - * \param size size of block - */ - virtual void Write( const void *ptr, size_t size ) = 0; - /*! \brief virtual destructor */ - virtual ~IStream( void ){} - }; - }; - - /*! - * \brief CPU/GPU: save a tensor by binary format, for GPU version, a temp Tensor storage will be allocated - * \param fo output binary stream - * \param src source data file - * \tparam dim dimension of tensor - * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. - */ - template - inline void SaveBinary( TStream &fo, const Tensor &src ); - /*! \brief refer to comment of cpu ver \sa SaveBinary */ - template - inline void SaveBinary( TStream &fo, const Tensor &src ); - - /*! - * \brief CPU/GPU: load a tensor by binary format, for GPU version, a temp Tensor storage will be allocated - * if pre_alloc is true , then space in dst is preallocated, and must have same shape of the tensor loaded - * if pre_alloc is false, then dst originally does not have space allocated, LoadBinary will allocate space for dst - * \param fi output binary stream - * \param dst destination file - * \param pre_alloc whether space is pre-allocated, if false, space allocation will happen - * \tparam dim dimension of tensor - * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. - */ - template - inline void LoadBinary( TStream &fi, Tensor &dst, bool pre_alloc ); - /*! \brief refer to comment of cpu ver \sa LoadBinary */ - template - inline void LoadBinary( TStream &fi, Tensor &dst, bool pre_alloc ); - - namespace utils{ - /*! \brief implementation of file i/o stream */ - class FileStream: public IStream{ - public: - /*! \brief constructor */ - FileStream( FILE *fp ):fp_(fp){} - virtual size_t Read( void *ptr, size_t size ){ - return fread( ptr, size, 1, fp_ ); - } - virtual void Write( const void *ptr, size_t size ){ - fwrite( ptr, size, 1, fp_ ); - } - /*! 
\brief close file */ - inline void Close( void ){ - fclose( fp_ ); - } - private: - FILE *fp_; - }; - }; -}; - -namespace mshadow{ - // implementations - template - inline void SaveBinary( TStream &fo, const Tensor &src_ ){ - fo.Write( src_.shape.shape_, sizeof(index_t) * dim ); - Tensor src = src_.FlatTo2D(); - for( index_t i = 0; i < src.shape[1]; ++ i ){ - fo.Write( src[i].dptr, sizeof(real_t)*src.shape[0] ); - } - } - template - inline void SaveBinary( TStream &fo, const Tensor &src ){ - // copy to CPU, then save - Tensor tmp( src.shape ); - AllocSpace( tmp ); - Copy( tmp, src ); - SaveBinary( fo, tmp ); - FreeSpace( tmp ); - } - - template - inline void LoadBinary( TStream &fi, Tensor &dst_, bool pre_alloc ){ - Shape shape; - utils::Assert( fi.Read( shape.shape_, sizeof(index_t) * dim ) != 0, "mshadow::LoadBinary" ); - if( pre_alloc ){ - utils::Assert( shape == dst_.shape ); - }else{ - dst_.shape = shape; AllocSpace( dst_ ); - } - Tensor dst = dst_.FlatTo2D(); - if( dst.shape[0] == 0 ) return; - for( index_t i = 0; i < dst.shape[1]; ++ i ){ - utils::Assert( fi.Read( dst[i].dptr, sizeof(real_t)*dst.shape[0] ) != 0, "mshadow::LoadBinary" ); - } - } - template - inline void LoadBinary( TStream &fi, Tensor &dst, bool pre_alloc ){ - Tensor tmp; - LoadBinary( fi, tmp, false ); - if( pre_alloc ){ - utils::Assert( tmp.shape == dst.shape ); - }else{ - dst.shape = tmp.shape; AllocSpace( dst ); - } - Copy( dst, tmp ); - FreeSpace( tmp ); - } -}; -#endif // TENSOR_IO_H From d57d4c709e191b76a142123e7859002d6874e805 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 25 Dec 2014 05:25:35 -0800 Subject: [PATCH 020/147] pass tensor container and io --- mshadow/tensor.h | 1 + mshadow/tensor_container.h | 295 +++++++++++++++++++------------------ 2 files changed, 151 insertions(+), 145 deletions(-) diff --git a/mshadow/tensor.h b/mshadow/tensor.h index 13f6a57f2aee..99b8dbf44860 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -533,4 +533,5 @@ inline void MapReduceKeepHighDim(TRValue *dst, #include "./expr_engine-inl.h" #include "./tensor_cpu-inl.h" #include "./io.h" +#include "./tensor_container.h" #endif // MSHADOW_TENSOR_H_ diff --git a/mshadow/tensor_container.h b/mshadow/tensor_container.h index f0699e735b0f..1aac6bd26605 100644 --- a/mshadow/tensor_container.h +++ b/mshadow/tensor_container.h @@ -1,152 +1,157 @@ -#ifndef MSHADOW_TENSOR_CONTAINER_H -#define MSHADOW_TENSOR_CONTAINER_H /*! + * Copyright (c) 2014 by Contributors * \file tensor_container.h * \brief tensor container that does memory allocation and resize like STL * \author Tianqi Chen */ -#include "tensor.h" -#include "tensor_io.h" +#ifndef MSHADOW_TENSOR_CONTAINER_H_ +#define MSHADOW_TENSOR_CONTAINER_H_ +#include "./tensor.h" +#include "./io.h" -namespace mshadow{ - /*! - * \brief tensor container that does memory allocation and resize like STL, - * use it to save the lines of FreeSpace in class. - * Do not abuse it, efficiency can come from pre-allocation and no re-allocation - * - * \tparam Device which device the tensor is on - * \tparam dimension dimension of the tensor - */ - template - class TensorContainer: public Tensor{ - public: - /*! - * \brief constructor - * \param pad whether use padding alignment in space allocation - */ - TensorContainer( bool pad = MSHADOW_ALLOC_PAD ){ - this->pad_ = pad; - this->dptr = data_.dptr = NULL; - this->shape[0] = 0; - this->shape.stride_ = 0; - this->data_.shape.stride_ = 0; - this->data_.shape[1] = 0; - } - /*! 
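
The point of the container is RAII plus cheap shrinking; a typical pattern with the 1.x API deleted here:

    mshadow::TensorContainer<cpu, 2> tmp(Shape2(10, 100));
    tmp = 1.0f;                      // expression assignment works as on Tensor
    tmp.Resize(Shape2(4, 50));       // fits in the existing buffer: no reallocation
    // no FreeSpace call needed: the destructor releases the memory
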
- * \brief constructor - * \param shape intial shape - */ - TensorContainer( const Shape &shape ){ - this->pad_ = MSHADOW_ALLOC_PAD; - data_.dptr = NULL; - this->AllocByShape( shape ); - } - /*! - * \brief constructor - * \param shape intial shape - * \param initv intial value - */ - TensorContainer( const Shape &shape, real_t initv ){ - this->pad_ = MSHADOW_ALLOC_PAD; - data_.dptr = NULL; - this->AllocByShape( shape ); - (*this) = initv; - } - ~TensorContainer( void ){ - this->FreeSpace(); - } - /*! - * \brief resize the container to given shape, content is NOT preserved - * \param shape target shape - */ - inline void Resize( const Shape &shape ){ - Shape<2> s2 = shape.FlatTo2D(); - if( s2.shape_[0] > data_.shape.stride_ || s2.shape_[1] > data_.shape[1] ){ - this->AllocByShape( shape ); - }else{ - this->shape = shape; - if( this->pad_ ){ - this->shape.stride_ = data_.shape.stride_; - }else{ - this->shape.stride_ = this->shape[ 0 ]; - } - } - } - /*! - * \brief resize the container to given shape, and initialize, content is NOT preserved - * \param shape target shape - * \param initv initialization value - */ - inline void Resize( const Shape &shape, real_t initv ){ - this->Resize( shape ); - (*this) = initv; - } - /*! \brief set whether padding is allowed in tensor */ - inline void set_pad( bool pad ){ - this->pad_ = pad; - } - /*! - * \brief save by binary format - * \param fo output binary stream - * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. - */ - template - inline void SaveBinary( TStream &fo ) const{ - mshadow::SaveBinary( fo, *this ); - } - /*! - * \brief load by binary format, a temp Tensor storage will be allocated - * \param fi input binary stream - * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. - */ - template - inline void LoadBinary( TStream &fi ) { - Tensor tmp; - mshadow::LoadBinary( fi, tmp, false ); - this->Resize( tmp.shape ); - Copy( *this, tmp ); - mshadow::FreeSpace( tmp ); - } - public: - // functions to fit exp template - inline Tensor& operator=( real_t s ){ - return this->__assign( s ); - } - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - private: - /*! \brief whether we do padding in the space */ - bool pad_; - /*! \brief the shape of data_ is actually current data space */ - Tensor data_; - private: - inline void FreeSpace (void){ - if( data_.dptr != NULL ){ - mshadow::FreeSpace( data_ ); - data_.dptr = this->dptr = NULL; - } - } - inline void AllocByShape (const Shape& shape){ - if( data_.dptr != NULL ){ - this->FreeSpace(); - } - data_.shape = shape.FlatTo2D(); - mshadow::AllocSpace( data_, pad_ ); - this->dptr = data_.dptr; - this->shape = shape; - if( this->pad_ ){ - this->shape.stride_ = data_.shape.stride_; - }else{ - this->shape.stride_ = shape[0]; - } - } - }; -};// namespace mshadow +namespace mshadow { +/*! + * \brief tensor container that does memory allocation and resize like STL, + * use it to save the lines of FreeSpace in class. + * Do not abuse it, efficiency can come from pre-allocation and no re-allocation + * + * \tparam Device which device the tensor is on + * \tparam dimension dimension of the tensor + */ +template +class TensorContainer: public Tensor { + public: + /*! 
+ * \brief constructor + * \param pad whether use padding alignment in space allocation + */ + explicit TensorContainer(bool pad = MSHADOW_ALLOC_PAD) { + this->pad_ = pad; + this->dptr_ = data_.dptr_ = NULL; + this->shape_[0] = 0; + this->stride_ = 0; + this->data_.stride_ = 0; + this->data_.shape_[0] = 0; + } + /*! + * \brief constructor + * \param shape intial shape + */ + explicit TensorContainer(const Shape &shape) { + this->pad_ = MSHADOW_ALLOC_PAD; + data_.dptr = NULL; + this->AllocByShape(shape); + } + /*! + * \brief constructor + * \param shape intial shape + * \param initv intial value + */ + explicit TensorContainer(const Shape &shape, DType initv) { + this->pad_ = MSHADOW_ALLOC_PAD; + data_.dptr = NULL; + this->AllocByShape(shape); + (*this) = initv; + } + ~TensorContainer(void) { + this->FreeSpace(); + } + /*! + * \brief resize the container to given shape, content is NOT preserved + * \param shape target shape + */ + inline void Resize(const Shape &shape) { + Shape<2> s2 = shape.FlatTo2D(); + if (s2.shape_[1] > data_.stride_ || s2.shape_[0] > data_.size(0)) { + this->AllocByShape(shape); + } else { + this->shape = shape; + if (this->pad_) { + this->stride_ = data_.stride_; + } else { + this->stride_ = s2.shape_[1]; + } + } + } + /*! + * \brief resize the container to given shape, and initialize, content is NOT preserved + * \param shape target shape + * \param initv initialization value + */ + inline void Resize(const Shape &shape, DType initv) { + this->Resize(shape); + (*this) = initv; + } + /*! \brief set whether padding is allowed in tensor */ + inline void set_pad(bool pad) { + this->pad_ = pad; + } + /*! + * \brief save by binary format + * \param fo output binary stream + * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. + */ + template + inline void SaveBinary(TStream &fo) const { + mshadow::SaveBinary(fo, *this); + } + /*! + * \brief load by binary format, a temp Tensor storage will be allocated + * \param fi input binary stream + * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. + */ + template + inline void LoadBinary(TStream &fi) { + Tensor tmp; + mshadow::LoadBinary(fi, &tmp, false); + this->Resize(tmp.shape_); + Copy(*this, tmp); + mshadow::FreeSpace(&tmp); + } + // functions to fit exp template + inline Tensor &operator=(DType s) { + return this->__assign(s); + } + template + inline Tensor & + operator=(const expr::Exp &exp) { + return this->__assign(exp); + } + template + inline Tensor & + operator=(const expr::Exp &exp) { + return this->__assign(exp); + } + template + inline Tensor & + operator=(const expr::Exp &exp) { + return this->__assign(exp); + } -#endif + private: + /*! \brief whether we do padding in the space */ + bool pad_; + /*! 
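
Resize only reallocates when the flattened target no longer fits the backing 2D store, so shrink-and-grow cycles within a bound are free; note that contents are NOT preserved either way. A sketch with the refactored API:

    TensorContainer<cpu, 3, float> buf(Shape3(4, 8, 8));
    buf.Resize(Shape3(2, 8, 8));   // fits: only shape_ and stride_ are updated
    buf.Resize(Shape3(8, 8, 8));   // too large: frees and reallocates
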
\brief the shape of data_ is actually current data space */ + Tensor data_; + // freespace + inline void FreeSpace(void) { + if (data_.dptr_ != NULL) { + mshadow::FreeSpace(data_); + data_.dptr = this->dptr = NULL; + } + } + inline void AllocByShape(const Shape& shape) { + if (data_.dptr_ != NULL) this->FreeSpace(); + data_.shape_ = shape.FlatTo2D(); + mshadow::AllocSpace(data_, pad_); + this->dptr = data_.dptr_; + this->shape_ = shape; + if (this->pad_) { + this->stride_ = data_.stride_; + } else { + this->stride_ = data_.size(1); + } + } +}; +} // namespace mshadow +#endif // MSHADOW_TENSOR_CONTAINER_H_ From 3af673858a4fdb6ee5d0ef595f9f3c06029c8b9e Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 25 Dec 2014 19:12:29 -0800 Subject: [PATCH 021/147] checkin broadcast --- example/basic.cpp | 22 +-- mshadow/base.h | 11 ++ mshadow/expr_engine-inl.h | 4 +- mshadow/extension.h | 13 ++ mshadow/extension/README.md | 1 + mshadow/extension/broadcast.h | 107 +++++++++++ mshadow/tensor.h | 20 +- mshadow/tensor_container.h | 2 +- mshadow/tensor_expr_ext.h | 347 ++++++++++++---------------------- 9 files changed, 280 insertions(+), 247 deletions(-) create mode 100644 mshadow/extension.h create mode 100644 mshadow/extension/README.md create mode 100644 mshadow/extension/broadcast.h diff --git a/example/basic.cpp b/example/basic.cpp index 76cb4e8e3f32..2e7869b0d29a 100644 --- a/example/basic.cpp +++ b/example/basic.cpp @@ -11,34 +11,34 @@ int main(void) { // assume we have a float space double data[20]; // create a 2 x 5 x 2 tensor, from existing space - Tensor ts(data, Shape3(2,5,2)); + Tensor ts(data, Shape3(2,5,2)); // take first subscript of the tensor - Tensor mat = ts[0]; + Tensor mat = ts[0]; // Tensor object is only a handle, assignment means they have same data content - Tensor mat2 = mat; - + Tensor mat2= NewTensor(Shape1(2), 0.0f); + mat2[1] = 10; // shape of matrix, note shape order is different from numpy // shape[i] indicate the shape of i-th dimension printf("%u X %u matrix, stride=%u\n", mat.size(0), mat.size(1), mat.stride_); - // initialize all element to zero - mat = 0.0f; + // assign some values mat[0][1] = 1.0f; mat[1][0] = 2.0f; // elementwise operations - mat = mat + 2.0f; + ts = broadcast<0>(mat2, ts.shape_); // print out matrix, note: mat2 and mat1 are handles(pointers) + for (index_t c = 0; c < ts.size(0); ++c) { for (index_t i = 0; i < mat.size(0); ++i) { for (index_t j = 0; j < mat.size(1); ++j) { - printf("%.2f ", mat[i][j]); + printf("%.2f ", ts[c][i][j]); } printf("\n"); } - + } // create a tensor without explictly allocating spaces. - Tensor mat3 = NewTensor(Shape2(2, 5), 0.0f); - Tensor mat4 = NewTensor(Shape2(2, 5), 1.0f); + Tensor mat3 = NewTensor(Shape2(2, 5), 0.0f); + Tensor mat4 = NewTensor(Shape2(2, 5), 1.0f); // transpose, and then add mat4. mat3 = tcast(mat.T()) + mat4; diff --git a/mshadow/base.h b/mshadow/base.h index 03498159f6e6..1301f2043fbc 100644 --- a/mshadow/base.h +++ b/mshadow/base.h @@ -114,6 +114,17 @@ extern "C" { #define MSHADOW_CONSTEXPR const #endif +/*! + * \brief default data type for tensor string + * in code release, change it to default_real_t + * during development, change it to empty string so that missing + * template arguments can be detected + */ +#ifndef MSHADOW_DEFAULT_DTYPE +#define MSHADOW_DEFAULT_DTYPE = default_real_t +//#define MSHADOW_DEFAULT_DTYPE +#endif + /*! \brief namespace for mshadow */ namespace mshadow { /*! 
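
With the macro in its release form, the trailing template parameter defaults to default_real_t; the development form (empty) forces every declaration to spell its element type:

    Tensor<cpu, 2> a;          // compiles only while MSHADOW_DEFAULT_DTYPE is '= default_real_t'
    Tensor<cpu, 2, float> b;   // always valid, and required when the default is disabled
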
\brief buffer size for each random number generator */ diff --git a/mshadow/expr_engine-inl.h b/mshadow/expr_engine-inl.h index 733b7682f1ee..5b7f4f8d7d13 100644 --- a/mshadow/expr_engine-inl.h +++ b/mshadow/expr_engine-inl.h @@ -68,7 +68,7 @@ class Plan, DType> { template class Plan, DType> { public: - explicit Plan(const Tensor &t) : dptr_(t.dptr_) {} + explicit Plan(const Tensor &t) : dptr_(t.dptr_) {} MSHADOW_XINLINE DType &Eval(index_t y, index_t x) { return dptr_[x]; } @@ -395,7 +395,7 @@ struct ExpComplexEngine, ltrans, rtrans, DType>, DType> { - inline static void Eval(Tensor *dst, + inline static void Eval(Tensor *dst, const DotExp, Tensor, ltrans, rtrans, DType> &exp) { diff --git a/mshadow/extension.h b/mshadow/extension.h new file mode 100644 index 000000000000..90c1c13291ae --- /dev/null +++ b/mshadow/extension.h @@ -0,0 +1,13 @@ +/*! + * + * \file extension.h + * \brief some extension of expressions, + * used to support something beyond elementwise op + * \author Tianqi Chen, Bing Xu + */ +#ifndef MSHADOW_EXTENSION_H_ +#define MSHADOW_EXTENSION_H_ +#include "./expr_engine-inl.h" +#include "./extension/broadcast.h" +#endif + diff --git a/mshadow/extension/README.md b/mshadow/extension/README.md new file mode 100644 index 000000000000..1edc901915eb --- /dev/null +++ b/mshadow/extension/README.md @@ -0,0 +1 @@ +This folder contains complex expressions that pulls multiple expression together diff --git a/mshadow/extension/broadcast.h b/mshadow/extension/broadcast.h new file mode 100644 index 000000000000..e5e203c6c58c --- /dev/null +++ b/mshadow/extension/broadcast.h @@ -0,0 +1,107 @@ +#ifndef MSHADOW_EXTENSION_BROADCAST_INL_H_ +#define MSHADOW_EXTENSION_BROADCAST_INL_H_ +/*! + * Copyright (c) 2014 by Contributors + * \file broadcast-inl.h + * \brief definitions of abstract expressions and expressions template + * \author Tianqi Chen + */ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief broadcast Tensor1D into a higher dimension Tensor + * input: Tensor: ishape[0] + * output: Tensor : oshape[dimcast] = ishape[0] + * \tparam SrcExp type of input expression + * \tparam DType the type of elements + * \tparam dimdst target tensor dimension + * \tparam dimcast_m_dst dimcast - dimdst + */ +template +struct Broadcast1DExp: + public MakeTensorExp, + SrcExp, dimdst, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief constructor */ + Broadcast1DExp(const SrcExp &src, Shape shape) + : src_(src) { + this->shape_ = shape; + } +}; +/*! + * \brief a expression that replicate a 1 dimension tensor in dimension dimcast + * \param src Tensor: shape[0] + * \param shape shape of output + * \return a expresion with type Tensor + * \tparam dimcast target dimension where the 1D tensor will be broadcasted + * \tparam SrcExp type of input expression + * \tparam DType the type of elements + * \tparam dimdst dimension of destination tensor + * \tparam dimcast_lowest the dimension we want to cast the data into + */ +template +inline Broadcast1DExp +broadcast(const expr::Exp &src, Shape shape) { + TypeCheckPass::kDim == 1> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + utils::Check(ShapeCheck<1, SrcExp>::Check(src.self())[0] == shape[dimcast], + "broadcast, shape mismatch"); + return Broadcast1DExp(src.self(), shape); +} +// short cut functions +/*! 
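
Concretely, adding a per-column bias vector to a matrix can be written either way (a sketch with the refactored API):

    Tensor<cpu, 1> bias = NewTensor<cpu>(Shape1(5), 0.1f);
    Tensor<cpu, 2> out = NewTensor<cpu>(Shape2(3, 5), 0.0f);
    out += repmat(bias, 3);                  // replicate bias over 3 rows
    out += broadcast<1>(bias, out.shape_);   // the same thing, spelled explicitly
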
+ * \brief a expression that replicate a 1 dimension tensor for nrow times + * \param src Tensor: shape[0] + * \param nrow number of rows to replicate + * \return a expresion with type Tensor size(1), size(0) = nrow + * \tparam Device which device it lies + */ +template +inline Broadcast1DExp +repmat(const expr::Exp &src, index_t nrow) { + return broadcast<1>(src, Shape2(nrow, ShapeCheck<1, SrcExp>::Check(src.self())[0])); +} +//---------------------- +// Execution plan +//---------------------- +/*! \brief execution plan of Broadcast1DExp */ +template +struct Plan, DType> { + public: + static const int dimcast = dimdst - dimdst_m_cast; + Plan(const Broadcast1DExp &e) + : src_(MakePlan(e.src_)), + ystride_(e.shape_.ProdShape(dimcast + 1, dimdst - 1)), + length_(e.shape_[dimcast]) { + TypeCheckPass + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + } + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return src_.Eval(0, (y / ystride_) % length_); + } + + private: + expr::Plan src_; + const index_t ystride_, length_; +}; + +/*! \brief execution plan of Broadcast1DExp */ +template +struct Plan, DType>{ + public: + Plan(const Broadcast1DExp &e) + : src_(MakePlan(e.src_)) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const{ + return src_.Eval(0, x); + } + + private: + expr::Plan src_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_BROADCAST_INL_H_ diff --git a/mshadow/tensor.h b/mshadow/tensor.h index 99b8dbf44860..24de6ca295c5 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -189,7 +189,7 @@ struct Stream { * \tparam dimension dimension of the tensor * \tparam DType the type of elements in the tensor */ -template +template struct TRValue: public expr::RValueExp { }; // more compact template @@ -199,7 +199,8 @@ struct TRValue: public expr::RValueExp { * \tparam dimension dimension of the tensor * \tparam DType the type of elements in the tensor */ -template +template struct Tensor: public TRValue, Device, dimension, DType> { public: @@ -331,13 +332,13 @@ struct Tensor: : dptr_(dptr), shape_(shape), stride_(shape[0]), stream_(NULL) {} MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape, index_t stride) : dptr_(dptr), shape_(shape), stride_(stride), stream_(NULL) {} - MSHADOW_XINLINE Tensor FlatTo2D(void) const { - return Tensor(dptr_, shape_.FlatTo2D(), stride_); + MSHADOW_XINLINE Tensor FlatTo2D(void) const { + return Tensor(dptr_, shape_.FlatTo2D(), stride_); } - MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const { + MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const { Shape<1> s; s[0] = end - begin; - return Tensor(dptr_ + begin, s); + return Tensor(dptr_ + begin, s); } MSHADOW_XINLINE index_t size(index_t i) const { return shape_[0]; @@ -453,9 +454,11 @@ inline void Copy(Tensor dst, * \param dst destination * \param energy input energy */ -inline void Softmax(Tensor dst, const Tensor &energy); +template +inline void Softmax(Tensor dst, const Tensor &energy); /*! \brief refer to comment of cpu ver \sa Softmax */ -inline void Softmax(Tensor dst, const Tensor &energy); +template +inline void Softmax(Tensor dst, const Tensor &energy); // function declarations to support expression, no need to understand them // these functions do not need to be directly used /*! 
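
The general plan above recovers the broadcast coordinate from the flattened (y, x) pair; a worked case for a 3D target of shape (n, c, w) = (2, 3, 4) with dimcast = 1:

    // y enumerates the (n, c) rows: y = n * 3 + c, and x runs over [0, 4)
    // ystride_ = ProdShape(2, 2) = 1 and length_ = 3, so
    // Eval(y, x) reads src_[(y / 1) % 3] == src_[c], independent of n and x
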
@@ -531,6 +534,7 @@ inline void MapReduceKeepHighDim(TRValue *dst, } // namespace mshadow // include headers #include "./expr_engine-inl.h" +#include "./extension.h" #include "./tensor_cpu-inl.h" #include "./io.h" #include "./tensor_container.h" diff --git a/mshadow/tensor_container.h b/mshadow/tensor_container.h index 1aac6bd26605..71d096e9d89d 100644 --- a/mshadow/tensor_container.h +++ b/mshadow/tensor_container.h @@ -132,7 +132,7 @@ class TensorContainer: public Tensor { /*! \brief whether we do padding in the space */ bool pad_; /*! \brief the shape of data_ is actually current data space */ - Tensor data_; + Tensor data_; // freespace inline void FreeSpace(void) { if (data_.dptr_ != NULL) { diff --git a/mshadow/tensor_expr_ext.h b/mshadow/tensor_expr_ext.h index 39742b8aecda..0f94a3dca0d0 100644 --- a/mshadow/tensor_expr_ext.h +++ b/mshadow/tensor_expr_ext.h @@ -9,28 +9,11 @@ namespace mshadow{ // Declaration of expressions goes here namespace expr{ - /*! - * \brief broadcast Tensor1D into a higher dimension Tensor - * input: Tensor: ishape[0] - * output: Tensor : oshape[dimcast] = ishape[0] - * \tparam Device which device it lies - * \tparam dimdst target tensor dimension - * \tparam dimcast the dimension where the 1D tensor fills in by index - */ - template - struct Broadcast1DExp: public MakeTensorExp< Broadcast1DExp,Tensor,dimdst>{ - /*! \brief source operand */ - const Tensor src_; - /*! \brief constructor */ - Broadcast1DExp( const Tensor &src, Shape shape ):src_(src){ - this->shape_ = shape; - } - }; /*! * \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution, this expression allow unpack of a batch * this is a version support unpacking multiple images - * after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations: + * after getting unpacked mat, we can use: output = dot(weight, mat) to get covolved results, the relations: * \tparam SrcExp source expression * \tparam dstdim destination dimension */ @@ -51,17 +34,17 @@ namespace mshadow{ /*! \brief width of img */ index_t i_width_; /*! \brief constructor */ - UnpackPatchToColXExp( const SrcExp &img, index_t psize_y, index_t psize_x, index_t pstride ) + UnpackPatchToColXExp(const SrcExp &img, index_t psize_y, index_t psize_x, index_t pstride) :img_(img), psize_y_(psize_y), psize_x_(psize_x), pstride_(pstride){ - Shape imshape = ShapeCheck::Check( img_ ); - utils::Assert( imshape[0] >= psize_x && imshape[1] >= psize_y, "UnpackPatchToCol:image shape smaller than patch size"); + Shape imshape = ShapeCheck::Check(img_); + utils::Assert(imshape[0] >= psize_x && imshape[1] >= psize_y, "UnpackPatchToCol:image shape smaller than patch size"); this->i_channel_ = imshape[2]; this->i_height_ = imshape[1]; this->i_width_ = imshape[0]; // calculate number of batches - const index_t num = imshape.ProdShape( 3, srcdim ); - const index_t o_height = ( i_height_ - psize_y ) / pstride + 1; - const index_t o_width = ( i_width_ - psize_x ) / pstride + 1; + const index_t num = imshape.ProdShape(3, srcdim); + const index_t o_height = (i_height_ - psize_y) / pstride + 1; + const index_t o_width = (i_width_ - psize_x) / pstride + 1; this->shape_[0] = o_height * o_width * num; this->shape_[1] = psize_y * psize_x * imshape[2]; } @@ -84,13 +67,13 @@ namespace mshadow{ /*! \brief patch stride */ index_t pstride_; /*! 
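
The relation in the comment above is the classic im2col formulation of convolution; end to end it reads as follows (a sketch; col and out are suitably sized Tensor<cpu,2> temporaries, index order per the doc comment above):

    col = unpack_patch2col(img, ky, kx, stride);  // (in_ch*ky*kx, oh*ow*num)
    out = dot(weight, col);                       // (out_ch,      oh*ow*num)
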
\brief constructor */ - PackColToPatchXExp( const Tensor &mat, Shape imshape, index_t psize_y, index_t psize_x, index_t pstride ) + PackColToPatchXExp(const Tensor &mat, Shape imshape, index_t psize_y, index_t psize_x, index_t pstride) :mat_(mat), psize_y_(psize_y), psize_x_(psize_x), pstride_(pstride){ this->shape_ = imshape; - const index_t o_height = ( imshape[1] - psize_y ) / pstride + 1; - const index_t o_width = ( imshape[0] - psize_x ) / pstride + 1; - utils::Assert( mat.shape[0] == o_height * o_width * imshape.ProdShape(3,dstdim), "PackColToPatchExp: mat.shape[0] mismatch" ); - utils::Assert( mat.shape[1] == psize_y * psize_x * imshape[2], "PackColToPatchExp: mat.shape[1] mismatch" ); + const index_t o_height = (imshape[1] - psize_y) / pstride + 1; + const index_t o_width = (imshape[0] - psize_x) / pstride + 1; + utils::Assert(mat.shape[0] == o_height * o_width * imshape.ProdShape(3,dstdim), "PackColToPatchExp: mat.shape[0] mismatch"); + utils::Assert(mat.shape[1] == psize_y * psize_x * imshape[2], "PackColToPatchExp: mat.shape[1] mismatch"); } }; @@ -109,9 +92,9 @@ namespace mshadow{ /*! \brief smallest dimension of input */ index_t ishape0_; /*! \brief constructor */ - ReshapeExp( const SrcExp &src, Shape shape ):src_(src){ - Shape ishape = ShapeCheck::Check( src_ ); - utils::Assert( ishape.Size() == shape.Size(), "reshape size must match" ); + ReshapeExp(const SrcExp &src, Shape shape):src_(src){ + Shape ishape = ShapeCheck::Check(src_); + utils::Assert(ishape.Size() == shape.Size(), "reshape size must match"); ishape0_ = ishape[0]; this->shape_ = shape; } @@ -132,9 +115,9 @@ namespace mshadow{ /*! \brief source expression */ const SrcExp& src_; /*! \brief constructor */ - SwapAxisExp( const SrcExp &src ):src_(src){ + SwapAxisExp(const SrcExp &src):src_(src){ this->shape_ = ShapeCheck::Check(src); - std::swap( this->shape_[a1], this->shape_[a2] ); + std::swap(this->shape_[a1], this->shape_[a2]); } }; @@ -155,7 +138,7 @@ namespace mshadow{ /*! \brief source operand, scale of the */ real_t scale_; /*! \brief construct a repmat expression from src and nrow */ - ReduceTo1DExp( const EType& src, real_t scale ):src_(src),scale_(scale){} + ReduceTo1DExp(const EType& src, real_t scale):src_(src),scale_(scale){} }; /*! @@ -179,10 +162,10 @@ namespace mshadow{ /*! \brief source width shape[0] */ index_t src_width_; /*! \brief constructor */ - PoolingExp( const SrcExp &src, index_t ksize_y, index_t ksize_x, index_t kstride ) + PoolingExp(const SrcExp &src, index_t ksize_y, index_t ksize_x, index_t kstride) : src_(src), ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { - Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check( src_ ); - utils::Assert( sshape[0] >= ksize_x && sshape[1] >= ksize_y, "pool: kernel must be smaller than image" ); + Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check(src_); + utils::Assert(sshape[0] >= ksize_x && sshape[1] >= ksize_y, "pool: kernel must be smaller than image"); this->src_height_ = sshape[1]; this->src_width_ = sshape[0]; this->shape_ = sshape; @@ -190,10 +173,10 @@ namespace mshadow{ this->shape_[0] = (src_width_ - ksize_x) / kstride + 1; } /*! 
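
The pooled extent uses the same formula as patch unpacking, out = (in - ksize) / kstride + 1; e.g. the usual 2x2 max pooling with stride 2, using the stock red::maximum reducer:

    out = pool<red::maximum>(in, 2, 2, 2);   // a 32x32 plane becomes 16x16: (32-2)/2+1 == 16
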
\brief constructor, specify shape */ - PoolingExp( const SrcExp &src, Shape<2> pshape, index_t ksize_y, index_t ksize_x, index_t kstride ) + PoolingExp(const SrcExp &src, Shape<2> pshape, index_t ksize_y, index_t ksize_x, index_t kstride) : src_(src), ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { - Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check( src_ ); - utils::Assert( sshape[0] >= ksize_x && sshape[1] >= ksize_y, "pool: kernel must be smaller than image" ); + Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check(src_); + utils::Assert(sshape[0] >= ksize_x && sshape[1] >= ksize_y, "pool: kernel must be smaller than image"); this->src_height_ = sshape[1]; this->src_width_ = sshape[0]; this->shape_ = sshape; @@ -222,13 +205,13 @@ namespace mshadow{ /*! \brief kernel stride */ index_t kstride_; /*! \brief constructor */ - UnPoolingExp( const Tensor &data_src, const Tensor &data_pooled, - const Tensor &grad_pooled, index_t ksize_y, index_t ksize_x, index_t kstride ) + UnPoolingExp(const Tensor &data_src, const Tensor &data_pooled, + const Tensor &grad_pooled, index_t ksize_y, index_t ksize_x, index_t kstride) : data_src_(data_src), data_pooled_(data_pooled), grad_pooled_(grad_pooled), ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { - utils::Assert( grad_pooled.shape == data_pooled.shape, "UnPoolingExp: pooled shape mismatch" ); - utils::Assert( grad_pooled.shape[2] == data_src.shape[2], "UnPoolingExp: pool and src shape mismatch" ); - utils::Assert( grad_pooled.shape[3] == data_src.shape[3], "UnPoolingExp: pool and src shape mismatch" ); + utils::Assert(grad_pooled.shape == data_pooled.shape, "UnPoolingExp: pooled shape mismatch"); + utils::Assert(grad_pooled.shape[2] == data_src.shape[2], "UnPoolingExp: pool and src shape mismatch"); + utils::Assert(grad_pooled.shape[3] == data_src.shape[3], "UnPoolingExp: pool and src shape mismatch"); this->shape_ = data_src_.shape; } }; @@ -251,9 +234,9 @@ namespace mshadow{ /*! \brief source tensor width */ index_t src_width_; /*! \brief constructor */ - PaddingExp( const SrcExp &src, index_t pad_y, index_t pad_x ) + PaddingExp(const SrcExp &src, index_t pad_y, index_t pad_x) : src_(src), pad_y_(pad_y), pad_x_(pad_x) { - this->shape_ = ShapeCheck::Check( src_ ); + this->shape_ = ShapeCheck::Check(src_); src_height_ = this->shape_[1]; src_width_ = this->shape_[0]; this->shape_[1] += pad_y * 2; // height @@ -277,8 +260,8 @@ namespace mshadow{ /*! \brief src height */ index_t src_height_; /*! \brief constructor */ - CroppingExp(const SrcExp &src, Shape<2> cshape ): src_(src) { - this->shape_ = ShapeCheck::Check( src_ ); + CroppingExp(const SrcExp &src, Shape<2> cshape): src_(src) { + this->shape_ = ShapeCheck::Check(src_); utils::Assert(this->shape_[1] >= cshape[1], "CroppingExp: height requirement not met"); utils::Assert(this->shape_[0] >= cshape[0], "CroppingExp: width requirement not met"); pad_height_ = (this->shape_[1] - cshape[1]) / 2; @@ -288,9 +271,9 @@ namespace mshadow{ this->shape_[0] = cshape[0]; // height } /*! 
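
pad and crop undo each other when the sizes agree, which is how 'same'-style convolutions keep their spatial extent; a round-trip sketch:

    padded = pad(img, 2, 2);                       // height += 4, width += 4
    same = crop(padded, Shape2(height, width));    // centre crop restores img's extent
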
\brief constructor */ - CroppingExp(const SrcExp &src, Shape<2> cshape, index_t start_height, index_t start_width ) + CroppingExp(const SrcExp &src, Shape<2> cshape, index_t start_height, index_t start_width ) : src_(src), pad_height_(start_height), pad_width_(start_width) { - this->shape_ = ShapeCheck::Check( src_ ); + this->shape_ = ShapeCheck::Check(src_); utils::Assert(this->shape_[1] >= cshape[1]+start_height, "CroppingExp: height requirement not met"); utils::Assert(this->shape_[0] >= cshape[0]+start_width, "CroppingExp: width requirement not met"); src_height_ = this->shape_[1]; @@ -311,8 +294,8 @@ namespace mshadow{ /*! \brief source operand */ const SrcExp& src_; /*! \brief constructor */ - MirroringExp( const SrcExp &src ): src_(src) { - this->shape_ = ShapeCheck::Check( src_ ); + MirroringExp(const SrcExp &src): src_(src) { + this->shape_ = ShapeCheck::Check(src_); } }; @@ -329,10 +312,10 @@ namespace mshadow{ /*! \brief neighbor size */ index_t nsize_; /*! \brief constructor */ - ChannelPoolingExp( const SrcExp &src, index_t nsize ): src_(src), nsize_(nsize){ - utils::Assert( nsize % 2 == 1, "ChannelPoolingExp: local size must be odd, to make it symmetric" ); - this->shape_ = ShapeCheck::Check( src_ ); - utils::Assert( this->shape_[2] >= nsize_, "ChannelPoolingExp: local size need to be smaller than number of channels" ); + ChannelPoolingExp(const SrcExp &src, index_t nsize): src_(src), nsize_(nsize){ + utils::Assert(nsize % 2 == 1, "ChannelPoolingExp: local size must be odd, to make it symmetric"); + this->shape_ = ShapeCheck::Check(src_); + utils::Assert(this->shape_[2] >= nsize_, "ChannelPoolingExp: local size need to be smaller than number of channels"); } }; }; // namespace expr @@ -342,39 +325,23 @@ namespace mshadow{ namespace expr{ /*! \brief operator overload */ template - inline ReduceTo1DExp operator*( const ReduceTo1DExp &e, real_t scale ){ - return ReduceTo1DExp( e.src_, e.scale_*scale ); + inline ReduceTo1DExp operator*(const ReduceTo1DExp &e, real_t scale){ + return ReduceTo1DExp(e.src_, e.scale_*scale); } /*! \brief operator overload */ template - inline ReduceTo1DExp operator*( real_t scale, const ReduceTo1DExp &e ){ - return ReduceTo1DExp( e.src_, e.scale_*scale ); - } - - /*! - * \brief a expression that replicate a 1 dimension tensor in dimension dimcast - * \param src Tensor: shape[0] - * \param shape shape of output - * \return a expresion with type Tensor - * \tparam dimcast target dimension where the 1D tensor will be broadcasted - * \tparam Device which device it lies - * \tparam dimdst dimension of destination tensor - */ - template - inline Broadcast1DExp broadcast( const Tensor &src, Shape shape ){ - TypeCheckPass< dimcast::Error_Expression_Does_Not_Meet_Dimension_Req(); - utils::Assert( src.shape[0] == shape[dimcast], "broadcast, shape mismatch" ); - return Broadcast1DExp( src, shape ); + inline ReduceTo1DExp operator*(real_t scale, const ReduceTo1DExp &e){ + return ReduceTo1DExp(e.src_, e.scale_*scale); } /*! 
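
The two overloads above fold a scalar into the reduction instead of spawning a second elementwise pass; e.g. an averaged row sum (sum_rows is defined further down in this file):

    dst = sum_rows(mat) * (1.0f / nrow);   // the scale rides along in MapReduce's scale argument
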
* \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution - * after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations: + * after getting unpacked mat, we can use: output = dot(weight, mat) to get covolved results, the relations: * * weight; shape[1]: out_channel, shape[0]: ichannel*psize_y*psize_x * output; shape[1]: out_channel, shape[0]: out_height*out_width * num_of_images - * out_height = ( in_height - psize_y ) / pstride + 1, this means we pad inperfect patch with 0 - * out_width = ( in_width - psize_x ) / pstride + 1 + * out_height = (in_height - psize_y) / pstride + 1, this means we pad inperfect patch with 0 + * out_width = (in_width - psize_x) / pstride + 1 * * \return mat target matrix; shape[1]: in_channel*psize_y*psize_x shape[0]: out_height*out_width * num_of_images * \param img source image; shape[2]: in_channels, shape[1]: in_height, shape[0]: in_width, can be 3D or 4D tensor(multiple images) @@ -385,9 +352,9 @@ namespace mshadow{ * \tparam etype type of expression */ template - inline UnpackPatchToColXExp::kDim > unpack_patch2col( const Exp &img, index_t psize_y, index_t psize_x, index_t pstride ){ + inline UnpackPatchToColXExp::kDim > unpack_patch2col(const Exp &img, index_t psize_y, index_t psize_x, index_t pstride){ TypeCheckPass< ExpInfo::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return UnpackPatchToColXExp::kDim >( img.self(), psize_y, psize_x, pstride ); + return UnpackPatchToColXExp::kDim >(img.self(), psize_y, psize_x, pstride); } /*! @@ -401,9 +368,9 @@ namespace mshadow{ * \tparam Device the Device where input data lies */ template - inline PackColToPatchXExp pack_col2patch( const Tensor &mat, Shape imshape, index_t psize_y, index_t psize_x, index_t pstride ){ - utils::Assert( imshape[0] >= psize_x && imshape[1] >= psize_y, "PackColToPatch:image shape smaller than patch size"); - return PackColToPatchXExp( mat, imshape, psize_y, psize_x, pstride ); + inline PackColToPatchXExp pack_col2patch(const Tensor &mat, Shape imshape, index_t psize_y, index_t psize_x, index_t pstride){ + utils::Assert(imshape[0] >= psize_x && imshape[1] >= psize_y, "PackColToPatch:image shape smaller than patch size"); + return PackColToPatchXExp(mat, imshape, psize_y, psize_x, pstride); } /*! @@ -416,8 +383,8 @@ namespace mshadow{ * \tparam dimdst target dimension */ template - inline ReshapeExp< SrcExp,dimdst, ExpInfo::kDim > reshape( const Exp &src, Shape oshape ){ - return ReshapeExp< SrcExp,dimdst, ExpInfo::kDim >( src.self(), oshape ); + inline ReshapeExp< SrcExp,dimdst, ExpInfo::kDim > reshape(const Exp &src, Shape oshape){ + return ReshapeExp< SrcExp,dimdst, ExpInfo::kDim >(src.self(), oshape); } /*! @@ -430,10 +397,10 @@ namespace mshadow{ * \tparam etype source expression type */ template - inline SwapAxisExp< SrcExp, ExpInfo::kDim, a1,a2> swapaxis( const Exp &src ){ + inline SwapAxisExp< SrcExp, ExpInfo::kDim, a1,a2> swapaxis(const Exp &src){ typedef ExpInfo Info; TypeCheckPass< Info::kDim>=a1+1 && Info::kDim >= a2+1 && a1+1 <= a2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return SwapAxisExp< SrcExp,Info::kDim,a1,a2>( src.self() ); + return SwapAxisExp< SrcExp,Info::kDim,a1,a2>(src.self()); } /*! 
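
pack_col2patch is the scatter-add inverse of unpack_patch2col, i.e. exactly the gradient of im2col; the backward pass of the convolution sketched earlier is then:

    gcol = dot(weight.T(), gout);                             // gradient w.r.t. the column matrix
    gimg = pack_col2patch(gcol, img.shape, ky, kx, stride);   // accumulate back into image layout
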
@@ -445,8 +412,8 @@ namespace mshadow{ * \tparam etype type of expression */ template - inline ReduceTo1DExp sumall_except_dim( const Exp &exp ){ - return ReduceTo1DExp( exp.self(), 1.0f ); + inline ReduceTo1DExp sumall_except_dim(const Exp &exp){ + return ReduceTo1DExp(exp.self(), 1.0f); } /*! @@ -461,7 +428,7 @@ namespace mshadow{ * \tparam etype type of expression */ template - inline PoolingExp::kDim > pool( const Exp &src, index_t ksize_y, index_t ksize_x, index_t kstride ) { + inline PoolingExp::kDim > pool(const Exp &src, index_t ksize_y, index_t ksize_x, index_t kstride) { TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); return PoolingExp::kDim >(src.self(), ksize_y, ksize_x, kstride); } @@ -479,7 +446,7 @@ namespace mshadow{ * \tparam etype type of expression */ template - inline PoolingExp::kDim > pool( const Exp &src, Shape<2> pshape, index_t ksize_y, index_t ksize_x, index_t kstride ) { + inline PoolingExp::kDim > pool(const Exp &src, Shape<2> pshape, index_t ksize_y, index_t ksize_x, index_t kstride) { TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); return PoolingExp::kDim >(src.self(), pshape, ksize_y, ksize_x, kstride); } @@ -497,8 +464,8 @@ namespace mshadow{ * \tparam Device device where data lies */ template - inline UnPoolingExp unpool( const Tensor&data_src, const Tensor &data_pooled, - const Tensor &grad_pooled, index_t ksize_y, index_t ksize_x, index_t kstride ) { + inline UnPoolingExp unpool(const Tensor&data_src, const Tensor &data_pooled, + const Tensor &grad_pooled, index_t ksize_y, index_t ksize_x, index_t kstride) { return UnPoolingExp(data_src, data_pooled, grad_pooled, ksize_y, ksize_x, kstride); } @@ -541,7 +508,7 @@ namespace mshadow{ * \tparam etype type of expression */ template - inline CroppingExp::kDim> crop( const Exp &src, Shape<2> oshape ) { + inline CroppingExp::kDim> crop(const Exp &src, Shape<2> oshape) { TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); return CroppingExp::kDim>(src.self(), oshape); } @@ -556,7 +523,7 @@ namespace mshadow{ * \tparam etype type of expression */ template - inline CroppingExp::kDim> crop( const Exp &src, Shape<2> oshape, index_t start_height, index_t start_width ) { + inline CroppingExp::kDim> crop(const Exp &src, Shape<2> oshape, index_t start_height, index_t start_width) { TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); return CroppingExp::kDim>(src.self(), oshape, start_height, start_width); } @@ -584,22 +551,11 @@ namespace mshadow{ * \tparam etype type of expression */ template - inline ChannelPoolingExp::kDim > chpool( const Exp &src, index_t nsize ) { + inline ChannelPoolingExp::kDim > chpool(const Exp &src, index_t nsize) { TypeCheckPass< ExpInfo::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req(); return ChannelPoolingExp::kDim >(src.self(),nsize); } - // short cut functions - /*! - * \brief a expression that replicate a 1 dimension tensor for nrow times - * \param src Tensor: shape[0] - * \param nrow number of rows to replicate - * \return a expresion with type Tensor shape[0], shape[1] = nrow - * \tparam Device which device it lies - */ - template - inline Broadcast1DExp repmat( const Tensor &src, index_t nrow ){ - return broadcast<0>( src, Shape2( nrow, src.shape[0] ) ); - } + /*! 
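
unpool needs all three tensors because it must route each pooled gradient back to the input position that produced the pooled value; the canonical backward call matching the pool example above:

    gsrc = unpool<red::maximum>(data_src, data_pooled, grad_pooled, 2, 2, 2);
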
* \brief a expression that sum over rows of a matrix * \param exp input expression that must be a matrix Tensor @@ -608,8 +564,8 @@ namespace mshadow{ * \tparam etype type of expression */ template - inline ReduceTo1DExp sum_rows( const Exp &exp ){ - return sumall_except_dim<0>( exp ); + inline ReduceTo1DExp sum_rows(const Exp &exp){ + return sumall_except_dim<0>(exp); } }; // namespace expr @@ -623,63 +579,35 @@ namespace mshadow{ namespace expr{ template struct ExpComplexEngine< SV, Device, 1, ReduceTo1DExp >{ - inline static void Eval( Tensor &dst, const ReduceTo1DExp &exp ){ + inline static void Eval(Tensor &dst, const ReduceTo1DExp &exp){ TypeCheckPass< dimkeep!=0 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - MapReduceKeepHighDim( dst, exp.src_, exp.scale_ ); + MapReduceKeepHighDim(dst, exp.src_, exp.scale_); } }; template struct ExpComplexEngine< SV, Device, 1, ReduceTo1DExp >{ - inline static void Eval( Tensor &dst, const ReduceTo1DExp &exp ){ - MapReduceKeepLowest( dst, exp.src_, exp.scale_ ); + inline static void Eval(Tensor &dst, const ReduceTo1DExp &exp){ + MapReduceKeepLowest(dst, exp.src_, exp.scale_); } }; }; // namespace expr namespace expr{ - /*! \brief execution plan of Broadcast1DExp */ - template - struct Plan< Broadcast1DExp >{ - public: - Plan( const Broadcast1DExp &e ) - : dptr_( e.src_.dptr ), - ystride_( e.shape_.ProdShape(1,dimcast) ), - length_(e.shape_[dimcast]){ - TypeCheckPass< dimcast!=0 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - } - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ (y / ystride_) % length_ ]; - } - private: - const real_t *dptr_; - const index_t ystride_, length_; - }; - /*! \brief execution plan of Broadcast1DExp */ - template - struct Plan< Broadcast1DExp >{ - public: - Plan( const Broadcast1DExp &e ): dptr_( e.src_.dptr ){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ x ]; - } - private: - const real_t *dptr_; - }; }; // namespace expr namespace expr{ template struct Plan< UnpackPatchToColXExp >{ public: - Plan( const UnpackPatchToColXExp &e ) + Plan(const UnpackPatchToColXExp &e) :src_(MakePlan(e.img_)), psize_y_(e.psize_y_), psize_x_(e.psize_x_), pstride_(e.pstride_), i_channel_(e.i_channel_), i_height_(e.i_height_), i_width_(e.i_width_), - o_height_(( i_height_ - psize_y_ ) / pstride_ + 1), - o_width_ (( i_width_ - psize_x_ ) / pstride_ + 1){ + o_height_((i_height_ - psize_y_) / pstride_ + 1), + o_width_ ((i_width_ - psize_x_) / pstride_ + 1){ } - MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ + MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const{ const index_t x_offset = i % psize_x_; const index_t idivp = i / psize_x_; const index_t y_offset = idivp % psize_y_; @@ -689,8 +617,8 @@ namespace mshadow{ const index_t y = (jdivw % o_height_) * pstride_ + y_offset; const index_t n = jdivw / o_height_; - if( x < i_width_ && y < i_height_ ){ - return src_.Eval( ( n * i_channel_ + c ) * i_height_ + y, x ); + if(x < i_width_ && y < i_height_){ + return src_.Eval((n * i_channel_ + c) * i_height_ + y, x); }else{ return 0.0f; } @@ -703,14 +631,14 @@ namespace mshadow{ template struct Plan< PackColToPatchXExp >{ public: - Plan( const PackColToPatchXExp &e ) + Plan(const PackColToPatchXExp &e) :mat_(e.mat_), psize_y_(e.psize_y_), psize_x_(e.psize_x_), pstride_(e.pstride_), i_channel_(e.shape_[2]), i_height_(e.shape_[1]), - o_height_(( e.shape_[1] - psize_y_ ) / pstride_ + 1), - o_width_(( e.shape_[0] - psize_x_ ) / pstride_ + 1){ + o_height_((e.shape_[1] - 
psize_y_) / pstride_ + 1), + o_width_((e.shape_[0] - psize_x_) / pstride_ + 1){ // note: i/o convention are same as unpack } - MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ + MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const{ using namespace std; const index_t y = i % i_height_; const index_t idivh = i / i_height_; @@ -719,11 +647,11 @@ namespace mshadow{ const index_t x = j; const index_t py_min = y < psize_y_ ? 0 : (y-psize_y_+pstride_)/pstride_; const index_t px_min = x < psize_x_ ? 0 : (x-psize_x_+pstride_)/pstride_; - const index_t py_max = min( (y+pstride_)/pstride_, o_height_); - const index_t px_max = min( (x+pstride_)/pstride_, o_width_ ); + const index_t py_max = min((y+pstride_)/pstride_, o_height_); + const index_t px_max = min((x+pstride_)/pstride_, o_width_); real_t res = 0.0f; - for( index_t py = py_min; py < py_max; ++py ){ - for( index_t px = px_min; px < px_max; ++px ){ + for(index_t py = py_min; py < py_max; ++py){ + for(index_t px = px_min; px < px_max; ++px){ res += mat_[ (c * psize_y_ + y - py*pstride_) * psize_x_ + x - px*pstride_ ][ (n * o_height_ + py) * o_width_+px ]; } } @@ -739,12 +667,12 @@ namespace mshadow{ template struct Plan< ReshapeExp >{ public: - Plan( const ReshapeExp &e ) + Plan(const ReshapeExp &e) : src_(MakePlan(e.src_)), oshape0_(e.shape_[0]), ishape0_(e.ishape0_){ } - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + MSHADOW_XINLINE real_t Eval(index_t y, index_t x) const{ const index_t idx = y * oshape0_ + x; - return src_.Eval( idx / ishape0_, idx % ishape0_ ); + return src_.Eval(idx / ishape0_, idx % ishape0_); } private: Plan src_; @@ -754,11 +682,11 @@ namespace mshadow{ template struct Plan< ReshapeExp >{ public: - Plan( const ReshapeExp &e ) + Plan(const ReshapeExp &e) : src_(MakePlan(e.src_)), oshape0_(e.shape_[0]){ } - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return src_.Eval( 0, y * oshape0_ + x ); + MSHADOW_XINLINE real_t Eval(index_t y, index_t x) const{ + return src_.Eval(0, y * oshape0_ + x); } private: Plan src_; @@ -770,14 +698,14 @@ namespace mshadow{ template struct Plan< SwapAxisExp >{ public: - Plan( const SwapAxisExp &e ) + Plan(const SwapAxisExp &e) : src_(MakePlan(e.src_)), - shape1_( e.shape_.ProdShape( 1, a1 ) ), - shape2_( e.shape_[a1] ), - shape3_( e.shape_.ProdShape( a1+1, a2 ) ), - shape4_( e.shape_[a2] ){ + shape1_(e.shape_.ProdShape(1, a1)), + shape2_(e.shape_[a1]), + shape3_(e.shape_.ProdShape(a1+1, a2)), + shape4_(e.shape_[a2]){ } - MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ + MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const{ const index_t y = i % shape1_; i /= shape1_; const index_t z = i % shape2_; @@ -786,7 +714,7 @@ namespace mshadow{ i /= shape3_; const index_t n = i % shape4_; // swap z and n - return src_.Eval( ((((i/shape4_)*shape2_ + z) * shape3_+c) * shape4_ + n ) * shape1_ + y, j ); + return src_.Eval(((((i/shape4_)*shape2_ + z) * shape3_+c) * shape4_ + n) * shape1_ + y, j); } private: Plan src_; @@ -796,19 +724,19 @@ namespace mshadow{ template struct Plan< SwapAxisExp >{ public: - Plan( const SwapAxisExp &e ) + Plan(const SwapAxisExp &e) : src_(MakePlan(e.src_)), - shape0_( e.shape_[0] ), - shape1_( e.shape_.ProdShape(1,a2) ), - shape2_( e.shape_[a2] ){ + shape0_(e.shape_[0]), + shape1_(e.shape_.ProdShape(1,a2)), + shape2_(e.shape_[a2]){ } - MSHADOW_XINLINE real_t Eval( index_t i, index_t x ) const{ + MSHADOW_XINLINE real_t Eval(index_t i, index_t x) const{ // swap x and z const index_t y = i % shape1_; i /= shape1_; const 
index_t z = i % shape2_; const index_t n = i / shape2_; - return src_.Eval( ( n*shape0_ + x ) * shape1_ + y , z ); + return src_.Eval( (n*shape0_ + x) * shape1_ + y , z); } private: Plan src_; @@ -820,8 +748,8 @@ namespace mshadow{ template struct Plan< PoolingExp< Reducer, SrcExp, srcdim> > { public: - Plan( const PoolingExp &e ) - : src_( MakePlan( e.src_ ) ), ksize_y_(e.ksize_y_), ksize_x_(e.ksize_x_), + Plan(const PoolingExp &e) + : src_(MakePlan(e.src_)), ksize_y_(e.ksize_y_), ksize_x_(e.ksize_x_), kstride_(e.kstride_), src_height_(e.src_height_),src_width_(e.src_width_), new_height_(e.shape_[1]) { } @@ -829,16 +757,16 @@ namespace mshadow{ using namespace std; const index_t py = i % new_height_; const index_t y_start = py * kstride_; - const index_t y_end = min( y_start + ksize_y_, src_height_ ); + const index_t y_end = min(y_start + ksize_y_, src_height_); const index_t px = j; const index_t x_start = px * kstride_; - const index_t x_end = min( x_start + ksize_x_, src_width_ ); + const index_t x_end = min(x_start + ksize_x_, src_width_); const index_t c = i / new_height_; real_t res = Reducer::kInitV; for (index_t y = y_start; y < y_end; ++y) { for (index_t x = x_start; x < x_end; ++x) { - Reducer::Reduce( res, src_.Eval( c*src_height_+y, x ) ); + Reducer::Reduce(res, src_.Eval(c*src_height_+y, x)); } } return res; @@ -865,12 +793,12 @@ namespace mshadow{ const index_t py_min = y < ksize_y_ ? 0 : (y-ksize_y_+kstride_)/kstride_; const index_t px_min = x < ksize_x_ ? 0 : (x-ksize_x_+kstride_)/kstride_; - const index_t py_max = min( (y+kstride_)/kstride_, data_pooled_.shape[1]); - const index_t px_max = min( (x+kstride_)/kstride_, data_pooled_.shape[0]); + const index_t py_max = min((y+kstride_)/kstride_, data_pooled_.shape[1]); + const index_t px_max = min((x+kstride_)/kstride_, data_pooled_.shape[0]); real_t val = 0; - for( index_t py = py_min; py < py_max; ++py ){ - for( index_t px = px_min; px < px_max; ++px ){ + for(index_t py = py_min; py < py_max; ++py){ + for(index_t px = px_min; px < px_max; ++px){ val += Reducer::PartialGrad(vsrc, data_pooled_[0][c][py][px]) * grad_pooled_[0][c][py][px]; } } @@ -940,7 +868,7 @@ namespace mshadow{ Plan(const MirroringExp &e) : src_(MakePlan(e.src_)), width_(e.shape_[0]){} MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - return src_.Eval( i, width_ - j - 1 ); + return src_.Eval(i, width_ - j - 1); } private: Plan src_; @@ -952,8 +880,8 @@ namespace mshadow{ template struct Plan< ChannelPoolingExp< Reducer, SrcExp, srcdim> > { public: - Plan( const ChannelPoolingExp &e ) - : src_( MakePlan( e.src_ ) ), channel_(e.shape_[2]), + Plan(const ChannelPoolingExp &e) + : src_(MakePlan(e.src_)), channel_(e.shape_[2]), height_(e.shape_[1]),width_(e.shape_[0]), hnsize_(e.nsize_/2){ } MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { @@ -964,10 +892,10 @@ namespace mshadow{ const index_t n = i / channel_; const index_t x = j; const index_t cstart = c < hnsize_ ? 
0 : c - hnsize_; - const index_t cend = min( c + hnsize_ + 1, channel_ ); + const index_t cend = min(c + hnsize_ + 1, channel_); real_t res = Reducer::kInitV; - for( index_t cc = cstart; cc < cend; ++ cc ){ - Reducer::Reduce( res, src_.Eval( (n*channel_+cc)*height_ + y, x ) ); + for(index_t cc = cstart; cc < cend; ++ cc){ + Reducer::Reduce(res, src_.Eval((n*channel_+cc)*height_ + y, x)); } return res; } @@ -978,38 +906,7 @@ namespace mshadow{ }; }; // namespace mshadow -#if MSHADOW_USE_SSE -// implementations of SSE support, if possible -#include "tensor_sse-inl.hpp" -namespace mshadow{ - namespace expr{ - template - struct SSECheck< Broadcast1DExp >{ - const static bool kPass = true; - }; - template - struct SSEAlignCheck<2, Broadcast1DExp >{ - inline static bool Check( const Broadcast1DExp &exp ){ - return sse2::CheckAlign( exp.src_.dptr ); - } - }; - template - class SSEPlan< Broadcast1DExp >{ - public: - SSEPlan( const Broadcast1DExp &t ) - :dptr_(t.src_.dptr){} - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const{ - return sse2::FVec( &dptr_[ x ] ); - } - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ x ]; - } - private: - const real_t *dptr_; - }; - }; -}; -#endif + #endif From 91fa1d1409acc4311ec000099de05ec9326a35e7 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 25 Dec 2014 19:14:42 -0800 Subject: [PATCH 022/147] checkin broadcast --- mshadow/extension/broadcast.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/mshadow/extension/broadcast.h b/mshadow/extension/broadcast.h index e5e203c6c58c..47f5a124a95f 100644 --- a/mshadow/extension/broadcast.h +++ b/mshadow/extension/broadcast.h @@ -1,5 +1,5 @@ -#ifndef MSHADOW_EXTENSION_BROADCAST_INL_H_ -#define MSHADOW_EXTENSION_BROADCAST_INL_H_ +#ifndef MSHADOW_EXTENSION_BROADCAST_H_ +#define MSHADOW_EXTENSION_BROADCAST_H_ /*! * Copyright (c) 2014 by Contributors * \file broadcast-inl.h @@ -49,7 +49,8 @@ broadcast(const expr::Exp &src, Shape shape) { ::Error_Expression_Does_Not_Meet_Dimension_Req(); utils::Check(ShapeCheck<1, SrcExp>::Check(src.self())[0] == shape[dimcast], "broadcast, shape mismatch"); - return Broadcast1DExp(src.self(), shape); + return Broadcast1DExp(src.self(), shape); } // short cut functions /*! 
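[Editor's note, an illustration rather than part of the patch] The repmat shortcut touched by the next hunk is just broadcast<1> over the row dimension. A minimal sketch of the typical use, assuming pre-allocated cpu tensors with the refactored size()/shape_ accessors (names are illustrative only):

    #include "mshadow/tensor.h"
    using namespace mshadow;
    using namespace mshadow::expr;

    // add a length-k bias vector to every row of an (n, k) matrix
    void AddBias(Tensor<cpu, 2, float> out, Tensor<cpu, 1, float> bias) {
      out += repmat(bias, out.size(0));
      // equivalent explicit form, casting bias into dimension 1 of the target:
      // out += broadcast<1>(bias, out.shape_);
    }
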
@@ -62,7 +63,8 @@ broadcast(const expr::Exp<SrcExp, DType, etype> &src, Shape<dimdst> shape) {
 template<typename SrcExp, typename DType, int etype>
 inline Broadcast1DExp<SrcExp, DType, 2, 1>
 repmat(const expr::Exp<SrcExp, DType, etype> &src, index_t nrow) {
-  return broadcast<1>(src, Shape2(nrow, ShapeCheck<1, SrcExp>::Check(src.self())[0]));
+  return broadcast<1>
+      (src, Shape2(nrow, ShapeCheck<1, SrcExp>::Check(src.self())[0]));
 }
 //----------------------
 // Execution plan
@@ -73,7 +75,7 @@ template<typename SrcExp, typename DType, int dimdst, int dimdst_m_cast>
 struct Plan<Broadcast1DExp<SrcExp, DType, dimdst, dimdst_m_cast>, DType> {
  public:
   static const int dimcast = dimdst - dimdst_m_cast;
-  Plan(const Broadcast1DExp<SrcExp, DType, dimdst, dimdst_m_cast> &e)
+  explicit Plan(const Broadcast1DExp<SrcExp, DType, dimdst, dimdst_m_cast> &e)
       : src_(MakePlan(e.src_)),
         ystride_(e.shape_.ProdShape(dimcast + 1, dimdst - 1)),
         length_(e.shape_[dimcast]) {
@@ -93,15 +95,15 @@ struct Plan<Broadcast1DExp<SrcExp, DType, dimdst, dimdst_m_cast>, DType> {
 template<typename SrcExp, typename DType, int dimdst>
 struct Plan<Broadcast1DExp<SrcExp, DType, dimdst, 1>, DType>{
  public:
-  Plan(const Broadcast1DExp<SrcExp, DType, dimdst, 1> &e)
+  explicit Plan(const Broadcast1DExp<SrcExp, DType, dimdst, 1> &e)
       : src_(MakePlan(e.src_)) {}
-  MSHADOW_XINLINE DType Eval(index_t y, index_t x) const{
+  MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
     return src_.Eval(0, x);
   }
  private:
-  expr::Plan<SrcExp> src_;
+  expr::Plan<SrcExp, DType> src_;
 };
 }  // namespace expr
 }  // namespace mshadow
-#endif  // MSHADOW_EXTENSION_BROADCAST_INL_H_
+#endif  // MSHADOW_EXTENSION_BROADCAST_H_

From ed886362884c80d9c7b00e8d3a4a0292461d1fcc Mon Sep 17 00:00:00 2001
From: tqchen
Date: Fri, 26 Dec 2014 04:39:35 -0800
Subject: [PATCH 023/147] checkin reshape

---
 mshadow/extension/broadcast.h        |   8 +-
 mshadow/extension/pack_col2patch.h   | 118 +++++++++++++++++++++++++
 mshadow/extension/reshape.h          |  87 +++++++++++++++++++
 mshadow/extension/unpack_patch2col.h | 123 +++++++++++++++++++++++++++
 4 files changed, 332 insertions(+), 4 deletions(-)
 create mode 100644 mshadow/extension/pack_col2patch.h
 create mode 100644 mshadow/extension/reshape.h
 create mode 100644 mshadow/extension/unpack_patch2col.h

diff --git a/mshadow/extension/broadcast.h b/mshadow/extension/broadcast.h
index 47f5a124a95f..9ff3bb88b0d7 100644
--- a/mshadow/extension/broadcast.h
+++ b/mshadow/extension/broadcast.h
@@ -1,11 +1,11 @@
-#ifndef MSHADOW_EXTENSION_BROADCAST_H_
-#define MSHADOW_EXTENSION_BROADCAST_H_
 /*!
  * Copyright (c) 2014 by Contributors
- * \file broadcast-inl.h
- * \brief definitions of abstract expressions and expressions template
+ * \file broadcast.h
+ * \brief support for broadcast and repmat
  * \author Tianqi Chen
  */
+#ifndef MSHADOW_EXTENSION_BROADCAST_H_
+#define MSHADOW_EXTENSION_BROADCAST_H_
 #include "../extension.h"
 namespace mshadow {
 namespace expr {
diff --git a/mshadow/extension/pack_col2patch.h b/mshadow/extension/pack_col2patch.h
new file mode 100644
index 000000000000..cc8843f37916
--- /dev/null
+++ b/mshadow/extension/pack_col2patch.h
@@ -0,0 +1,118 @@
+/*!
+ * Copyright (c) 2014 by Contributors
+ * \file pack_col2patch.h
+ * \brief support for pack
+ * \author Tianqi Chen
+ */
+#ifndef MSHADOW_EXTENSION_PACK_COL2PATCH_H_
+#define MSHADOW_EXTENSION_PACK_COL2PATCH_H_
+#include <algorithm>
+#include "../extension.h"
+namespace mshadow {
+namespace expr {
+/*!
+ * \brief reverse operation of UnpackPatchToCol,
+ *   used to backprop gradient back
+ *   this is a version supporting multiple images
+ * \tparam SrcExp source expression
+ * \tparam DType the type of elements
+ * \tparam dstdim destination dimension
+ */
+template<typename SrcExp, typename DType, int dstdim>
+struct PackColToPatchXExp:
+      public MakeTensorExp<PackColToPatchXExp<SrcExp, DType, dstdim>,
+                           SrcExp, dstdim, DType> {
+  /*! \brief source operand */
+  const SrcExp &src_;
+  /*! \brief patch height */
+  index_t psize_y_;
+  /*! \brief patch width */
+  index_t psize_x_;
+  /*! \brief patch stride */
+  index_t pstride_;
+  /*! \brief constructor */
+  PackColToPatchXExp(const SrcExp &src, Shape<dstdim> imshape,
+                     index_t psize_y, index_t psize_x, index_t pstride)
+      :src_(src), psize_y_(psize_y), psize_x_(psize_x), pstride_(pstride){
+    this->shape_ = imshape;
+    const index_t o_height = (imshape[dstdim - 2] - psize_y) / pstride + 1;
+    const index_t o_width = (imshape[dstdim - 1] - psize_x) / pstride + 1;
+    Shape<2> sshape = ShapeCheck<2, SrcExp>::Check(src_);
+    utils::Check(sshape[1] == o_height * o_width *
+                 imshape.ProdShape(0, dstdim - 3),
+                 "PackColToPatchExp: src.size(1) mismatch");
+    utils::Check(sshape[0] == psize_y * psize_x * imshape[dstdim - 3],
+                 "PackColToPatchExp: src.size(0) mismatch");
+  }
+};
+/*!
+ * \brief reverse operation of unpack_patch2col, can be used to implement deconvolution
+ * \return packed img expression
+ * \param mat source matrix
+ * \param imshape shape of target img
+ * \param psize_y height of each patch
+ * \param psize_x width of each patch
+ * \param pstride stride of each patch
+ * \tparam SrcExp source expression
+ * \tparam DType the type of elements
+ * \tparam dstdim destination dimension
+ * \tparam etype type of expression
+ */
+template<typename SrcExp, typename DType, int dstdim, int etype>
+inline PackColToPatchXExp<SrcExp, DType, dstdim>
+pack_col2patch(const expr::Exp<SrcExp, DType, etype> &src,
+               Shape<dstdim> imshape, index_t psize_y,
+               index_t psize_x, index_t pstride) {
+  TypeCheckPass<ExpInfo<SrcExp>::kDim == 2>
+      ::Error_Expression_Does_Not_Meet_Dimension_Req();
+  utils::Check(imshape[dstdim - 1] >= psize_x && imshape[dstdim - 2] >= psize_y,
+               "PackColToPatch:image shape smaller than patch size");
+  return PackColToPatchXExp<SrcExp, DType, dstdim>(src.self(), imshape,
+                                                   psize_y, psize_x, pstride);
+}
+//----------------------
+// Execution plan
+//----------------------
+template<typename SrcExp, typename DType, int dstdim>
+struct Plan<PackColToPatchXExp<SrcExp, DType, dstdim>, DType> {
+ public:
+  explicit Plan(const PackColToPatchXExp<SrcExp, DType, dstdim> &e)
+      :src_(MakePlan(e.src_)), psize_y_(e.psize_y_),
+       psize_x_(e.psize_x_), pstride_(e.pstride_),
+       i_channel_(e.shape_[dstdim - 3]), i_height_(e.shape_[dstdim - 2]),
+       o_height_((e.shape_[dstdim - 2] - psize_y_) / pstride_ + 1),
+       o_width_((e.shape_[dstdim - 1] - psize_x_) / pstride_ + 1) {
+    // note: i/o convention are same as unpack
+  }
+  MSHADOW_XINLINE DType Eval(index_t i, index_t j) const {
+    using std::min;
+    const index_t y = i % i_height_;
+    const index_t idivh = i / i_height_;
+    const index_t c = idivh % i_channel_;
+    const index_t n = idivh / i_channel_;
+    const index_t x = j;
+    const index_t py_min =
+        y < psize_y_ ? 0 : (y-psize_y_ + pstride_) / pstride_;
+    const index_t px_min =
+        x < psize_x_ ? 0 : (x-psize_x_ + pstride_) / pstride_;
+    const index_t py_max = min((y + pstride_) / pstride_, o_height_);
+    const index_t px_max = min((x + pstride_) / pstride_, o_width_);
+    DType res = static_cast<DType>(0);
+    for (index_t py = py_min; py < py_max; ++py) {
+      for (index_t px = px_min; px < px_max; ++px) {
+        res += src_.Eval(((c * psize_y_ + y - py*pstride_) * psize_x_ +
+                          x - px * pstride_),
+                         (n * o_height_ + py) * o_width_ + px);
+      }
+    }
+    return res;
+  }
+
+ private:
+  Plan<SrcExp, DType> src_;
+  const index_t psize_y_, psize_x_, pstride_, i_channel_;
+  const index_t i_height_, o_height_, o_width_;
+};
+}  // namespace expr
+}  // namespace mshadow
+#endif  // MSHADOW_EXTENSION_PACK_COL2PATCH_H_
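[Editor's note, an illustration rather than part of the patch] A minimal sketch of the backprop use case described above, assuming a column-gradient matrix laid out exactly as unpack_patch2col produces it (names and sizes are illustrative):

    #include "mshadow/tensor.h"
    using namespace mshadow;
    using namespace mshadow::expr;

    // scatter-add patch-column gradients back into image layout
    void BackpropData(Tensor<cpu, 2, float> grad_col,   // (psize_y*psize_x*channel, oh*ow*num)
                      Tensor<cpu, 4, float> grad_img) { // (num, channel, height, width)
      const index_t psize_y = 5, psize_x = 5, pstride = 2;  // assumed patch setup
      grad_img = pack_col2patch(grad_col, grad_img.shape_, psize_y, psize_x, pstride);
    }
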
diff --git a/mshadow/extension/reshape.h b/mshadow/extension/reshape.h
new file mode 100644
index 000000000000..a96bc190bf13
--- /dev/null
+++ b/mshadow/extension/reshape.h
@@ -0,0 +1,87 @@
+/*!
+ * Copyright (c) 2014 by Contributors
+ * \file reshape.h
+ * \brief support for reshape
+ * \author Tianqi Chen
+ */
+#ifndef MSHADOW_EXTENSION_RESHAPE_H_
+#define MSHADOW_EXTENSION_RESHAPE_H_
+#include "../extension.h"
+namespace mshadow {
+namespace expr {
+/*!
+ * \brief reshape the content to another shape
+ * input: Tensor<Device, dimsrc>: ishape
+ * output: Tensor<Device, dimdst> ishape.Size() == oshape.Size()
+ * \tparam SrcExp source expression
+ * \tparam dimdst target dimension
+ * \tparam dimsrc source dimension
+ */
+template<typename SrcExp, typename DType, int dimdst, int dimsrc>
+struct ReshapeExp:
+      public MakeTensorExp<ReshapeExp<SrcExp, DType, dimdst, dimsrc>,
+                           SrcExp, dimdst, DType> {
+  /*! \brief source expression */
+  const SrcExp &src_;
+  /*! \brief smallest dimension of input */
+  index_t ishapex_;
+  /*! \brief constructor */
+  ReshapeExp(const SrcExp &src, Shape<dimdst> shape)
+      : src_(src) {
+    Shape<dimsrc> ishape = ShapeCheck<dimsrc, SrcExp>::Check(src_);
+    utils::Assert(ishape.Size() == shape.Size(), "reshape size must match");
+    ishapex_ = ishape[dimsrc - 1];
+    this->shape_ = shape;
+  }
+};
+/*!
+ * \brief an expression that reshapes a tensor to another shape
+ * \param src Tensor<Device, dimsrc>:
+ * \param oshape target shape
+ * \return an expression with type Tensor<Device, dimdst>
+ * \tparam SrcExp source expression
+ * \tparam etype source expression type
+ * \tparam dimdst target dimension
+ */
+template<int dimdst, typename SrcExp, typename DType, int etype>
+inline ReshapeExp<SrcExp, DType, dimdst, ExpInfo<SrcExp>::kDim>
+reshape(const Exp<SrcExp, DType, etype> &src, Shape<dimdst> oshape) {
+  return ReshapeExp<SrcExp, DType, dimdst, ExpInfo<SrcExp>::kDim>
+      (src.self(), oshape);
+}
+//----------------------
+// Execution plan
+//----------------------
+template<typename SrcExp, typename DType, int dimdst, int dimsrc>
+struct Plan<ReshapeExp<SrcExp, DType, dimdst, dimsrc>, DType> {
+ public:
+  explicit Plan(const ReshapeExp<SrcExp, DType, dimdst, dimsrc> &e)
+      : src_(MakePlan(e.src_)),
+        oshapex_(e.shape_[dimdst - 1]), ishapex_(e.ishapex_) {}
+  MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
+    const index_t idx = y * oshapex_ + x;
+    return src_.Eval(idx / ishapex_, idx % ishapex_);
+  }
+
+ private:
+  Plan<SrcExp, DType> src_;
+  const index_t oshapex_, ishapex_;
+};
+// special work plan for 1 dimensional data
+template<typename SrcExp, typename DType, int dimdst>
+struct Plan<ReshapeExp<SrcExp, DType, dimdst, 1>, DType> {
+ public:
+  explicit Plan(const ReshapeExp<SrcExp, DType, dimdst, 1> &e)
+      : src_(MakePlan(e.src_)), oshapex_(e.shape_[dimdst - 1]) {
+  }
+  MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
+    return src_.Eval(0, y * oshapex_ + x);
+  }
+
+ private:
+  Plan<SrcExp, DType> src_;
+  const index_t oshapex_;
+};
+}  // namespace expr
+}  // namespace mshadow
+#endif  // MSHADOW_EXTENSION_RESHAPE_H_
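[Editor's note, an illustration rather than part of the patch] A minimal sketch of reshape, assuming source and target sizes match exactly as the assert above requires (names are illustrative):

    #include "mshadow/tensor.h"
    using namespace mshadow;
    using namespace mshadow::expr;

    // view a batch of images as a (num, channel*height*width) matrix
    void Flatten(Tensor<cpu, 4, float> imgs, Tensor<cpu, 2, float> flat) {
      flat = reshape(imgs, flat.shape_);
      // the inverse view is just another reshape:
      // imgs = reshape(flat, imgs.shape_);
    }
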
diff --git a/mshadow/extension/unpack_patch2col.h b/mshadow/extension/unpack_patch2col.h
new file mode 100644
index 000000000000..619baf26bd2a
--- /dev/null
+++ b/mshadow/extension/unpack_patch2col.h
@@ -0,0 +1,123 @@
+/*!
+ * Copyright (c) 2014 by Contributors
+ * \file unpack_patch2col.h
+ * \brief support for unpack
+ * \author Tianqi Chen
+ */
+#ifndef MSHADOW_EXTENSION_UNPACK_PATCH2COL_H_
+#define MSHADOW_EXTENSION_UNPACK_PATCH2COL_H_
+#include "../extension.h"
+namespace mshadow {
+namespace expr {
+/*!
+ * \brief unpack local (overlap) patches of image to column of mat,
+ *  can be used to implement convolution; this expression allows unpacking
+ *  a batch, i.e. it is a version that supports unpacking multiple images
+ *  after getting unpacked mat, we can use: output = dot(weight, mat) to get convolved results, the relations:
+ * \tparam SrcExp source expression
+ * \tparam srcdim source dimension
+ */
+template<typename SrcExp, typename DType, int srcdim>
+struct UnpackPatchToColXExp:
+      public MakeTensorExp<UnpackPatchToColXExp<SrcExp, DType, srcdim>,
+                           SrcExp, 2, DType>{
+  /*! \brief source operand */
+  const SrcExp &img_;
+  /*! \brief patch height */
+  index_t psize_y_;
+  /*! \brief patch width */
+  index_t psize_x_;
+  /*! \brief patch stride */
+  index_t pstride_;
+  /*! \brief number of input channel */
+  index_t i_channel_;
+  /*! \brief height of img */
+  index_t i_height_;
+  /*! \brief width of img */
+  index_t i_width_;
+  /*! \brief constructor */
+  UnpackPatchToColXExp(const SrcExp &img,
+                       index_t psize_y,
+                       index_t psize_x,
+                       index_t pstride)
+      : img_(img), psize_y_(psize_y),
+        psize_x_(psize_x), pstride_(pstride) {
+    Shape<srcdim> imshape = ShapeCheck<srcdim, SrcExp>::Check(img_);
+    utils::Check(imshape[srcdim - 1] >= psize_x &&
+                 imshape[srcdim - 2] >= psize_y,
+                 "UnpackPatchToCol:image shape smaller than patch size");
+    this->i_channel_ = imshape[srcdim - 3];
+    this->i_height_ = imshape[srcdim - 2];
+    this->i_width_ = imshape[srcdim - 1];
+    // calculate number of batches
+    const index_t num = imshape.ProdShape(0, srcdim - 3);
+    const index_t o_height = (i_height_ - psize_y) / pstride + 1;
+    const index_t o_width = (i_width_ - psize_x) / pstride + 1;
+    this->shape_[1] = o_height * o_width * num;
+    this->shape_[0] = psize_y * psize_x * i_channel_;
+  }
+};
+
+/*!
+ * \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution
+ * after getting unpacked mat, we can use: output = dot(weight, mat) to get convolved results, the relations:
+ *
+ * weight; shape[0]: out_channel, shape[1]: ichannel * psize_y * psize_x
+ * output; shape[0]: out_channel, shape[1]: out_height * out_width * num_of_images
+ * out_height = (in_height - psize_y) / pstride + 1, this means we pad imperfect patches with 0
+ * out_width = (in_width - psize_x) / pstride + 1
+ *
+ * \return mat target matrix; shape[0]: in_channel*psize_y*psize_x shape[1]: out_height*out_width * num_of_images
+ * \param img source image; shape[-3]: in_channels, shape[-2]: in_height, shape[-1]: in_width, can be 3D or 4D tensor(multiple images)
+ * \param psize_y height of each patch
+ * \param psize_x width of each patch
+ * \param pstride stride of each patch
+ * \tparam SrcExp source expression
+ * \tparam DType the type of elements
+ * \tparam etype type of expression
+ */
+template<typename SrcExp, typename DType, int etype>
+inline UnpackPatchToColXExp<SrcExp, DType, ExpInfo<SrcExp>::kDim>
+unpack_patch2col(const Exp<SrcExp, DType, etype> &img,
+                 index_t psize_y, index_t psize_x, index_t pstride) {
+  TypeCheckPass<ExpInfo<SrcExp>::kDim >= 3>
+      ::Error_Expression_Does_Not_Meet_Dimension_Req();
+  return UnpackPatchToColXExp<SrcExp, DType, ExpInfo<SrcExp>::kDim>
+      (img.self(), psize_y, psize_x, pstride);
+}
+//----------------------
+// Execution plan
+//----------------------
+template<typename SrcExp, typename DType, int srcdim>
+struct Plan<UnpackPatchToColXExp<SrcExp, DType, srcdim>, DType> {
+ public:
+  explicit Plan(const UnpackPatchToColXExp<SrcExp, DType, srcdim> &e)
+      :src_(MakePlan(e.img_)),
+       psize_y_(e.psize_y_), psize_x_(e.psize_x_), pstride_(e.pstride_),
+       i_channel_(e.i_channel_), i_height_(e.i_height_), i_width_(e.i_width_),
+       o_height_((i_height_ - psize_y_) / pstride_ + 1),
+       o_width_((i_width_ - psize_x_) / pstride_ + 1) {}
+  MSHADOW_XINLINE DType Eval(index_t i, index_t j) const {
+    const index_t x_offset = i % psize_x_;
+    const index_t idivp = i / psize_x_;
+    const index_t y_offset = idivp % psize_y_;
+    const index_t c = idivp / psize_y_;
+    const index_t x = (j % o_width_) * pstride_ + x_offset;
+    const index_t jdivw = j / o_width_;
+    const index_t y = (jdivw % o_height_) * pstride_ + y_offset;
+    const index_t n = jdivw / o_height_;
+    if (x < i_width_ && y < i_height_) {
+      return src_.Eval((n * i_channel_ + c) * i_height_ + y, x);
+    } else {
+      return 0.0f;
+    }
+  }
+
+ private:
+  Plan<SrcExp, DType> src_;
+  const index_t psize_y_, psize_x_, pstride_, i_channel_;
+  const index_t i_height_, i_width_, o_height_, o_width_;
+};
+}  // namespace expr
+}  // namespace mshadow
+#endif  // MSHADOW_EXTENSION_UNPACK_PATCH2COL_H_
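[Editor's note, an illustration rather than part of the patch] Putting the two halves together: a convolution forward pass as documented above, output = dot(weight, mat). A sketch assuming pre-allocated cpu tensors with the shapes noted in the comments (names and sizes are illustrative):

    #include "mshadow/tensor.h"
    using namespace mshadow;
    using namespace mshadow::expr;

    void ConvForward(Tensor<cpu, 4, float> data,     // (num, ichannel, height, width)
                     Tensor<cpu, 2, float> weight,   // (ochannel, ichannel*ksize*ksize)
                     Tensor<cpu, 2, float> tmp_col,  // scratch, see shape relations above
                     Tensor<cpu, 2, float> out) {    // (ochannel, oheight*owidth*num)
      const index_t ksize = 3, kstride = 1;  // assumed kernel configuration
      tmp_col = unpack_patch2col(data, ksize, ksize, kstride);
      out = dot(weight, tmp_col);
    }
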
From fbb973d5c6e33d7948a0a56c0bad3f908f04d07b Mon Sep 17 00:00:00 2001
From: tqchen
Date: Fri, 26 Dec 2014 05:22:00 -0800
Subject: [PATCH 024/147] checkin swapaxis

---
 mshadow/extension/README.md | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 mshadow/extension/README.md

diff --git a/mshadow/extension/README.md b/mshadow/extension/README.md
deleted file mode 100644
index 1edc901915eb..000000000000
--- a/mshadow/extension/README.md
+++ /dev/null
@@ -1 +0,0 @@
-This folder contains complex expressions that pulls multiple expression together

From 85980ecfbdcfe3f97f07607a9e726d9c297ba97c Mon Sep 17 00:00:00 2001
From: tqchen
Date: Fri, 26 Dec 2014 05:22:08 -0800
Subject: [PATCH 025/147] checkin swapaxis

---
 mshadow/extension/swapaxis.h | 109 +++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 mshadow/extension/swapaxis.h

diff --git a/mshadow/extension/swapaxis.h b/mshadow/extension/swapaxis.h
new file mode 100644
index 000000000000..3fcda22b527e
--- /dev/null
+++ b/mshadow/extension/swapaxis.h
@@ -0,0 +1,109 @@
+/*!
+ * Copyright (c) 2014 by Contributors
+ * \file swapaxis.h
+ * \brief support for swapaxis
+ * \author Tianqi Chen
+ */
+#ifndef MSHADOW_EXTENSION_SWAPAXIS_H_
+#define MSHADOW_EXTENSION_SWAPAXIS_H_
+#include <algorithm>
+#include "../extension.h"
+namespace mshadow {
+namespace expr {
+/*!
+ * \brief swap two axes of a tensor
+ * input: Tensor<Device, dimsrc>: ishape
+ * output: Tensor<Device, dimsrc> oshape[a1], oshape[a2] = ishape[a2], ishape[a1]
+ *
+ * \tparam SrcExp type of source expression
+ * \tparam DType the type of elements
+ * \tparam dimsrc source dimension, assert a1 > a2
+ * \tparam m_a1 one dimension to be swapped, encoded by dimsrc - a1
+ * \tparam a2 second dimension to be swapped, encoded by a2
+ */
+template<typename SrcExp, typename DType, int dimsrc, int m_a1, int a2>
+struct SwapAxisExp:
+      public MakeTensorExp<SwapAxisExp<SrcExp, DType, dimsrc, m_a1, a2>,
+                           SrcExp, dimsrc, DType> {
+  // decode the a1, a2
+  static const int a1 = dimsrc - m_a1;
+  /*! \brief source expression */
+  const SrcExp &src_;
+  /*! \brief constructor */
+  explicit SwapAxisExp(const SrcExp &src) : src_(src) {
+    this->shape_ = ShapeCheck<dimsrc, SrcExp>::Check(src);
+    std::swap(this->shape_[a1], this->shape_[a2]);
+  }
+};
+/*!
+ * \brief an expression that swaps two axes of a tensor
+ * \param src Tensor<Device, dimsrc>:
+ * \return an expression with type Tensor<Device, dimsrc>
+ * \tparam a1 higher dimension to be swapped, assert a1 > a2
+ * \tparam a2 lower dimension to be swapped
+ * \tparam SrcExp source expression
+ * \tparam DType the type of elements
+ * \tparam etype source expression type
+ */
+template<int a1, int a2, typename SrcExp, typename DType, int etype>
+inline SwapAxisExp<SrcExp, DType, ExpInfo<SrcExp>::kDim,
+                   ExpInfo<SrcExp>::kDim - a1, a2>
+swapaxis(const Exp<SrcExp, DType, etype> &src) {
+  typedef ExpInfo<SrcExp> Info;
+  TypeCheckPass<Info::kDim >= a1 + 1 && Info::kDim >= a2 + 1 &&
+                a2 < a1>::Error_Expression_Does_Not_Meet_Dimension_Req();
+  return SwapAxisExp<SrcExp, DType, ExpInfo<SrcExp>::kDim,
+                     ExpInfo<SrcExp>::kDim - a1, a2>(src.self());
+}
+template<typename SrcExp, typename DType, int dimsrc, int m_a1, int a2>
+struct Plan<SwapAxisExp<SrcExp, DType, dimsrc, m_a1, a2>, DType> {
+ public:
+  // decode the a1
+  static const int a1 = dimsrc - m_a1;
+  explicit Plan(const SwapAxisExp<SrcExp, DType, dimsrc, m_a1, a2> &e)
+      : src_(MakePlan(e.src_)),
+        shapey_(e.shape_.ProdShape(a1 + 1, dimsrc - 1)),
+        shapez_(e.shape_[a1]),
+        shapec_(e.shape_.ProdShape(a2 + 1, a1)),
+        shapen_(e.shape_[a2]) {}
+  MSHADOW_XINLINE DType Eval(index_t i, index_t j) const {
+    const index_t y = i % shapey_;
+    i /= shapey_;
+    const index_t z = i % shapez_;
+    i /= shapez_;
+    const index_t c = i % shapec_;
+    i /= shapec_;
+    const index_t n = i % shapen_;
+    // swap z and n
+    return src_.Eval(((((i / shapen_) * shapez_ + z) * shapec_ +
+                          c) * shapen_ + n) * shapey_ + y, j);
+  }
+
+ private:
+  Plan<SrcExp, DType> src_;
+  const index_t shapey_, shapez_, shapec_, shapen_;
+};
+template<typename SrcExp, typename DType, int dimsrc, int a2>
+struct Plan<SwapAxisExp<SrcExp, DType, dimsrc, 1, a2>, DType> {
+ public:
+  explicit Plan(const SwapAxisExp<SrcExp, DType, dimsrc, 1, a2> &e)
+      : src_(MakePlan(e.src_)),
+        shapex_(e.shape_[dimsrc - 1]),
+        shapey_(e.shape_.ProdShape(a2 + 1, dimsrc - 1)),
+        shapez_(e.shape_[a2]) {}
+  MSHADOW_XINLINE DType Eval(index_t i, index_t x) const {
+    // swap x and z
+    const index_t y = i % shapey_;
+    i /= shapey_;
+    const index_t z = i % shapez_;
+    const index_t n = i / shapez_;
+    return src_.Eval((n * shapex_ + x) * shapey_ + y , z);
+  }
+
+ private:
+  Plan<SrcExp, DType> src_;
+  const index_t shapex_, shapey_, shapez_;
+};
+}  // namespace expr
+}  // namespace mshadow
+#endif  // MSHADOW_EXTENSION_SWAPAXIS_H_
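[Editor's note, an illustration rather than part of the patch] A minimal sketch of swapaxis, assuming pre-allocated cpu tensors; the template arguments name the two axes, with the larger one first (names are illustrative):

    #include "mshadow/tensor.h"
    using namespace mshadow;
    using namespace mshadow::expr;

    // (num, channel, height, width) -> (channel, num, height, width)
    void SwapBatchChannel(Tensor<cpu, 4, float> src, Tensor<cpu, 4, float> dst) {
      dst = swapaxis<1, 0>(src);
    }
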
From 8b0c534d6da39962d48eab354b6280d2cf1bc14e Mon Sep 17 00:00:00 2001
From: tqchen
Date: Fri, 26 Dec 2014 06:02:00 -0800
Subject: [PATCH 026/147] add reduce to 1d

---
 mshadow/extension/reduceto1d.h | 82 ++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 mshadow/extension/reduceto1d.h

diff --git a/mshadow/extension/reduceto1d.h b/mshadow/extension/reduceto1d.h
new file mode 100644
index 000000000000..bb8cdbbbdaf1
--- /dev/null
+++ b/mshadow/extension/reduceto1d.h
@@ -0,0 +1,82 @@
+/*!
+ * Copyright (c) 2014 by Contributors
+ * \file reduceto1d.h
+ * \brief support for sum_rows and sumall_except_dim
+ * \author Tianqi Chen
+ */
+#ifndef MSHADOW_EXTENSION_REDUCETO1D_H_
+#define MSHADOW_EXTENSION_REDUCETO1D_H_
+#include "../extension.h"
+namespace mshadow {
+namespace expr {
+/*!
+ * \brief reduction to 1 dimension tensor
+ * input: Tensor<Device, dimsrc>: ishape
+ * output: Tensor<Device, 1> shape[0] = ishape[dimkeep];
+ *
+ * \tparam SrcExp type of expression to be reduced
+ * \tparam DType the data type of the scalar
+ * \tparam Reducer which reducer to use
+ * \tparam m_dimkeep which dimension to be kept, encoded with dimsrc - dimkeep
+ */
+template<typename SrcExp, typename DType, typename Reducer, int m_dimkeep>
+struct ReduceTo1DExp:
+      public Exp<ReduceTo1DExp<SrcExp, DType, Reducer, m_dimkeep>,
+                 DType, type::kComplex> {
+  /*! \brief source operand */
+  const SrcExp &src_;
+  /*! \brief scale applied to the reduction result */
+  DType scale_;
+  /*! \brief construct a reduce expression from src and scale */
+  ReduceTo1DExp(const SrcExp& src, DType scale) : src_(src), scale_(scale) {}
+};
+/*!
+ * \brief a sum over all dimensions, except dimkeep
+ * \param exp input expression that must be a matrix Tensor
+ * \return an expression with type Tensor<Device, 1>
+ * \tparam dimkeep the dimension that will be kept
+ * \tparam SrcExp expression
+ * \tparam etype type of expression
+ */
+template<int dimkeep, typename SrcExp, typename DType, int etype>
+inline ReduceTo1DExp<SrcExp, DType, red::sum,
+                     ExpInfo<SrcExp>::kDim - dimkeep>
+sumall_except_dim(const Exp<SrcExp, DType, etype> &exp) {
+  return ReduceTo1DExp<SrcExp, DType, red::sum,
+                       ExpInfo<SrcExp>::kDim - dimkeep>(exp.self(), 1);
+}
+/*!
+ * \brief an expression that sums over the rows of a matrix
+ * \param exp input expression that must be a matrix Tensor
+ * \return an expression with type Tensor<Device, 1>
+ * \tparam SrcExp expression
+ * \tparam etype type of expression
+ */
+template<typename SrcExp, typename DType, int etype>
+inline ReduceTo1DExp<SrcExp, DType, red::sum, 1>
+sum_rows(const Exp<SrcExp, DType, etype> &exp) {
+  TypeCheckPass<ExpInfo<SrcExp>::kDim ==2>
+    ::Error_Expression_Does_Not_Meet_Dimension_Req();
+  return sumall_except_dim<1>(exp);
+}
+template<typename SV, typename Device, typename DType,
+         typename SrcExp, typename Reducer, int m_dimkeep>
+struct ExpComplexEngine<SV, Tensor<Device, 1, DType>, ReduceTo1DExp<SrcExp, DType, Reducer, m_dimkeep>, DType> {
+  static const int dimkeep = ExpInfo<SrcExp>::kDim - m_dimkeep;
+  inline static void Eval(Tensor<Device, 1, DType> *dst,
+                          const ReduceTo1DExp<SrcExp, DType, Reducer, m_dimkeep> &exp) {
+    TypeCheckPass<m_dimkeep != 1>::Error_Expression_Does_Not_Meet_Dimension_Req();
+    MapReduceKeepHighDim<SV, Reducer, dimkeep>(dst, exp.src_, exp.scale_);
+  }
+};
+template<typename SV, typename Device, typename DType,
+         typename SrcExp, typename Reducer>
+struct ExpComplexEngine<SV, Tensor<Device, 1, DType>, ReduceTo1DExp<SrcExp, DType, Reducer, 1>, DType> {
+  inline static void Eval(Tensor<Device, 1, DType> *dst,
+                          const ReduceTo1DExp<SrcExp, DType, Reducer, 1> &exp) {
+    MapReduceKeepLowest<SV, Reducer>(dst, exp.src_, exp.scale_);
+  }
+};
+}  // namespace expr
+}  // namespace mshadow
+#endif  // MSHADOW_EXTENSION_REDUCETO1D_H_

From 303362f8cc77fbd8378520e68737f2f007059f5d Mon Sep 17 00:00:00 2001
From: tqchen
Date: Fri, 26 Dec 2014 06:08:33 -0800
Subject: [PATCH 027/147] change

---
 mshadow/base.h                              |  12 +-
 .../{expression-inl.h => expr_scalar-inl.h} |  13 +
 mshadow/expression.h                        |  15 +-
 mshadow/extension.h                         |   5 +
 mshadow/tensor.h                            |  14 +
 mshadow/tensor_cpu-inl.h                    |  12 +-
 mshadow/tensor_expr_ext.h                   | 384 +-----------------
 7 files changed, 46 insertions(+), 409 deletions(-)
 rename mshadow/{expression-inl.h => expr_scalar-inl.h} (89%)

diff --git a/mshadow/base.h b/mshadow/base.h
index 1301f2043fbc..640086883bf3 100644
--- a/mshadow/base.h
+++ b/mshadow/base.h
@@ -121,8 +121,8 @@ extern "C" {
  * template arguments can be detected
  */
 #ifndef MSHADOW_DEFAULT_DTYPE
-#define MSHADOW_DEFAULT_DTYPE = default_real_t
-//#define MSHADOW_DEFAULT_DTYPE
+//#define MSHADOW_DEFAULT_DTYPE = default_real_t
+#define MSHADOW_DEFAULT_DTYPE
 #endif

 /*! \brief namespace for mshadow */
@@ -275,8 +275,8 @@ struct sum {
     return 1;
   }
   template<typename DType>
-  MSHADOW_XINLINE static DType InitValue(void) {
-    return 0;
+  MSHADOW_XINLINE static void SetInitValue(DType &initv) {
+    initv = 0;
   }
 };
 /*! \brief helper namespace to get the limits */
@@ -313,8 +313,8 @@ struct maximum {
     return redres == redsrc ? 1: 0;
   }
   template<typename DType>
-  MSHADOW_XINLINE static DType InitValue(void) {
-    return limits::MinValue<DType>();
+  MSHADOW_XINLINE static void SetInitValue(DType &initv) {
+    initv = limits::MinValue<DType>();
   }
 };
 }  // namespace red
diff --git a/mshadow/expression-inl.h b/mshadow/expr_scalar-inl.h
similarity index 89%
rename from mshadow/expression-inl.h
rename to mshadow/expr_scalar-inl.h
index 77ebe6f69fdb..a0efdc1ab649 100644
--- a/mshadow/expression-inl.h
+++ b/mshadow/expr_scalar-inl.h
@@ -27,6 +27,19 @@ operator*(MSHADOW_SCALAR_ lhs,
                       MSHADOW_SCALAR_>(rhs.lhs_, rhs.rhs_, rhs.scale_ * lhs);
 }
+/*! 
\brief operator overload */ +template +inline ReduceTo1DExp +operator*(MSHADOW_SCALAR_ scale, const ReduceTo1DExp &e) { + return ReduceTo1DExp(e.src_, e.scale_ * scale); +} + /*! \brief operator overload for const */ template inline BinaryMapExp, diff --git a/mshadow/expression.h b/mshadow/expression.h index fe8f50773b87..b5c26d5e65df 100644 --- a/mshadow/expression.h +++ b/mshadow/expression.h @@ -208,6 +208,7 @@ class RValueExp: public Exp { * \tparam TB type of rhs * \tparam ltrans whether lhs is transposed * \tparam rtrans whether rhs is transposed + * \tparam DType the data type of the scalar */ template struct DotExp: public Exp, @@ -356,18 +357,4 @@ F(const Exp &src) { } } // namespace expr } // namespace mshadow -// add definition of scalar related operators -#ifdef MSAHDOW_SCALAR_ - #error "MSHADOW_SCALAR_ must not be defined" -#endif -// enumerate all the scalar data type we aim to be good at -#define MSHADOW_SCALAR_ float -#include "./expression-inl.h" -#undef MSHADOW_SCALAR_ -#define MSHADOW_SCALAR_ double -#include "./expression-inl.h" -#undef MSHADOW_SCALAR_ -#define MSHADOW_SCALAR_ int -#include "./expression-inl.h" -#undef MSHADOW_SCALAR_ #endif // MSHADOW_EXPRESSION_H_ diff --git a/mshadow/extension.h b/mshadow/extension.h index 90c1c13291ae..0e9089ccc6ba 100644 --- a/mshadow/extension.h +++ b/mshadow/extension.h @@ -9,5 +9,10 @@ #define MSHADOW_EXTENSION_H_ #include "./expr_engine-inl.h" #include "./extension/broadcast.h" +#include "./extension/unpack_patch2col.h" +#include "./extension/pack_col2patch.h" +#include "./extension/reshape.h" +#include "./extension/swapaxis.h" +#include "./extension/reduceto1d.h" #endif diff --git a/mshadow/tensor.h b/mshadow/tensor.h index 24de6ca295c5..fa9ebf4fb36e 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -538,4 +538,18 @@ inline void MapReduceKeepHighDim(TRValue *dst, #include "./tensor_cpu-inl.h" #include "./io.h" #include "./tensor_container.h" +// add definition of scalar related operators +#ifdef MSAHDOW_SCALAR_ + #error "MSHADOW_SCALAR_ must not be defined" +#endif +// enumerate all the scalar data type we aim to be good at +#define MSHADOW_SCALAR_ float +#include "./expr_scalar-inl.h" +#undef MSHADOW_SCALAR_ +#define MSHADOW_SCALAR_ double +#include "./expr_scalar-inl.h" +#undef MSHADOW_SCALAR_ +#define MSHADOW_SCALAR_ int +#include "./expr_scalar-inl.h" +#undef MSHADOW_SCALAR_ #endif // MSHADOW_TENSOR_H_ diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h index 3ea32d038b72..e216daabb733 100644 --- a/mshadow/tensor_cpu-inl.h +++ b/mshadow/tensor_cpu-inl.h @@ -116,9 +116,9 @@ inline void MapReduceKeepLowest(TRValue *dst, ::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); Shape<2> eshape = expr::ShapeCheck::kDim, E> ::Check(exp.self()).FlatTo2D(); - utils::Check(eshape[0] == dst->self().size(0), - "reduction dimension do not match"); - utils::Check(eshape[1] != 0, "can not reduce over empty tensor"); + utils::Check(eshape[1] == dst->self().size(0), + "MapReduceKeepLowest::reduction dimension do not match"); + utils::Check(eshape[0] != 0, "can not reduce over empty tensor"); // execution expr::Plan dplan = MakePlan(dst->self()); expr::Plan splan = MakePlan(exp.self()); @@ -142,7 +142,7 @@ inline void MapReduceKeepHighDim(TRValue *dst, EShape eshape = expr::ShapeCheck::kDim, E> ::Check(exp.self()); utils::Check(eshape[dimkeep] == dst->self().size(0), - "reduction dimension do not match"); + "MapReduceKeepHighDim::reduction dimension do not match"); // use equvalent form Shape<4> pshape = Shape4(eshape.ProdShape(0, 
dimkeep), eshape[dimkeep], @@ -152,9 +152,9 @@ inline void MapReduceKeepHighDim(TRValue *dst, expr::Plan dplan = MakePlan(dst->self()); expr::Plan splan = MakePlan(exp.self()); for (index_t c = 0; c < pshape[1]; ++c) { - DType res = Reducer::kInitV; + DType res; Reducer::SetInitValue(res); for (index_t n = 0; n < pshape[0]; ++n) { - DType tres = Reducer::kInitV; + DType tres; Reducer::SetInitValue(tres); for (index_t y = 0; y < pshape[2]; ++y) { for (index_t x = 0; x < pshape[3]; ++x) { Reducer::Reduce(tres, diff --git a/mshadow/tensor_expr_ext.h b/mshadow/tensor_expr_ext.h index 0f94a3dca0d0..b47fced65afa 100644 --- a/mshadow/tensor_expr_ext.h +++ b/mshadow/tensor_expr_ext.h @@ -10,136 +10,6 @@ namespace mshadow{ // Declaration of expressions goes here namespace expr{ - /*! - * \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution, this expression allow unpack of a batch - * this is a version support unpacking multiple images - * after getting unpacked mat, we can use: output = dot(weight, mat) to get covolved results, the relations: - * \tparam SrcExp source expression - * \tparam dstdim destination dimension - */ - template - struct UnpackPatchToColXExp: public MakeTensorExp< UnpackPatchToColXExp, SrcExp, 2>{ - /*! \brief source operand */ - const SrcExp& img_; - /*! \brief patch height */ - index_t psize_y_; - /*! \brief patch width */ - index_t psize_x_; - /*! \brief patch stride */ - index_t pstride_; - /*! \brief number of input channel */ - index_t i_channel_; - /*! \brief height of img */ - index_t i_height_; - /*! \brief width of img */ - index_t i_width_; - /*! \brief constructor */ - UnpackPatchToColXExp(const SrcExp &img, index_t psize_y, index_t psize_x, index_t pstride) - :img_(img), psize_y_(psize_y), psize_x_(psize_x), pstride_(pstride){ - Shape imshape = ShapeCheck::Check(img_); - utils::Assert(imshape[0] >= psize_x && imshape[1] >= psize_y, "UnpackPatchToCol:image shape smaller than patch size"); - this->i_channel_ = imshape[2]; - this->i_height_ = imshape[1]; - this->i_width_ = imshape[0]; - // calculate number of batches - const index_t num = imshape.ProdShape(3, srcdim); - const index_t o_height = (i_height_ - psize_y) / pstride + 1; - const index_t o_width = (i_width_ - psize_x) / pstride + 1; - this->shape_[0] = o_height * o_width * num; - this->shape_[1] = psize_y * psize_x * imshape[2]; - } - }; - - /*! - * \brief reverse operation of UnpackPatchToCol, used to backprop gradient back - * this is a version supporting multiple images - * \tparam Device which device it lies - * \tparam dstdim destination dimension - */ - template - struct PackColToPatchXExp: public MakeTensorExp< PackColToPatchXExp, Tensor, dstdim>{ - /*! \brief source operand */ - const Tensor& mat_; - /*! \brief patch height */ - index_t psize_y_; - /*! \brief patch height */ - index_t psize_x_; - /*! \brief patch stride */ - index_t pstride_; - /*! \brief constructor */ - PackColToPatchXExp(const Tensor &mat, Shape imshape, index_t psize_y, index_t psize_x, index_t pstride) - :mat_(mat), psize_y_(psize_y), psize_x_(psize_x), pstride_(pstride){ - this->shape_ = imshape; - const index_t o_height = (imshape[1] - psize_y) / pstride + 1; - const index_t o_width = (imshape[0] - psize_x) / pstride + 1; - utils::Assert(mat.shape[0] == o_height * o_width * imshape.ProdShape(3,dstdim), "PackColToPatchExp: mat.shape[0] mismatch"); - utils::Assert(mat.shape[1] == psize_y * psize_x * imshape[2], "PackColToPatchExp: mat.shape[1] mismatch"); - } - }; - - /*! 
- * \brief reshape the content to another shape - * input: Tensor: ishape - * output: Tensor ishape.Size() == oshape.Size() - * \tparam SrcExp source expression - * \tparam dimdst target dimension - * \tparam dimsrc source dimension - */ - template - struct ReshapeExp: public MakeTensorExp< ReshapeExp, SrcExp, dimdst>{ - /*! \brief source expression */ - const SrcExp& src_; - /*! \brief smallest dimension of input */ - index_t ishape0_; - /*! \brief constructor */ - ReshapeExp(const SrcExp &src, Shape shape):src_(src){ - Shape ishape = ShapeCheck::Check(src_); - utils::Assert(ishape.Size() == shape.Size(), "reshape size must match"); - ishape0_ = ishape[0]; - this->shape_ = shape; - } - }; - - /*! - * \brief swap two axis of a tensor - * input: Tensor: ishape - * output: Tensor oshape[a1],oshape[a2] = ishape[a2],oshape[a1] - * - * \tparam SrcExp type of source expression - * \tparam dimsrc source dimension - * \tparam a1 smaller dimension to be swapped - * \tparam a2 larger dimension to be swapped - */ - template - struct SwapAxisExp: public MakeTensorExp< SwapAxisExp, SrcExp, dimsrc>{ - /*! \brief source expression */ - const SrcExp& src_; - /*! \brief constructor */ - SwapAxisExp(const SrcExp &src):src_(src){ - this->shape_ = ShapeCheck::Check(src); - std::swap(this->shape_[a1], this->shape_[a2]); - } - }; - - /*! - * \brief reduction to 1 dimension tensor - * input: Tensor: ishape - * output: Tensor shape[0] = ishape[dimkeep]; - * - * \tparam EType type of expression to be reduced - * \tparam Reducer which reducer to use - * \tparam srcdim dimension of source - * \tparam dimkeep which dimension to be kept, - */ - template - struct ReduceTo1DExp: public Exp< ReduceTo1DExp, type::kComplex >{ - /*! \brief source operand */ - const EType& src_; - /*! \brief source operand, scale of the */ - real_t scale_; - /*! \brief construct a repmat expression from src and nrow */ - ReduceTo1DExp(const EType& src, real_t scale):src_(src),scale_(scale){} - }; /*! * \brief pooling expression, do reduction over local patches of a image @@ -323,98 +193,7 @@ namespace mshadow{ // Declaration of all functions go here namespace expr{ - /*! \brief operator overload */ - template - inline ReduceTo1DExp operator*(const ReduceTo1DExp &e, real_t scale){ - return ReduceTo1DExp(e.src_, e.scale_*scale); - } - /*! \brief operator overload */ - template - inline ReduceTo1DExp operator*(real_t scale, const ReduceTo1DExp &e){ - return ReduceTo1DExp(e.src_, e.scale_*scale); - } - - /*! 
- * \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution - * after getting unpacked mat, we can use: output = dot(weight, mat) to get covolved results, the relations: - * - * weight; shape[1]: out_channel, shape[0]: ichannel*psize_y*psize_x - * output; shape[1]: out_channel, shape[0]: out_height*out_width * num_of_images - * out_height = (in_height - psize_y) / pstride + 1, this means we pad inperfect patch with 0 - * out_width = (in_width - psize_x) / pstride + 1 - * - * \return mat target matrix; shape[1]: in_channel*psize_y*psize_x shape[0]: out_height*out_width * num_of_images - * \param img source image; shape[2]: in_channels, shape[1]: in_height, shape[0]: in_width, can be 3D or 4D tensor(multiple images) - * \param psize_y height of each patch - * \param psize_x width of each patch - * \param pstride stride of each patch - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline UnpackPatchToColXExp::kDim > unpack_patch2col(const Exp &img, index_t psize_y, index_t psize_x, index_t pstride){ - TypeCheckPass< ExpInfo::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return UnpackPatchToColXExp::kDim >(img.self(), psize_y, psize_x, pstride); - } - /*! - * \brief reverse operation of pack_col2patch, can be used to implement deconvolution - * \return packed img expression - * \param mat source matrix - * \param imshape shape of target img - * \param psize_y height of each patch - * \param psize_x height of each patch - * \param pstride stride of each patch - * \tparam Device the Device where input data lies - */ - template - inline PackColToPatchXExp pack_col2patch(const Tensor &mat, Shape imshape, index_t psize_y, index_t psize_x, index_t pstride){ - utils::Assert(imshape[0] >= psize_x && imshape[1] >= psize_y, "PackColToPatch:image shape smaller than patch size"); - return PackColToPatchXExp(mat, imshape, psize_y, psize_x, pstride); - } - - /*! - * \brief a expression that reshapes a tensor to another shape - * \param src Tensor: - * \param oshape target shape - * \return a expresion with type Tensor - * \tparam SrcExp source expression - * \tparam etype source expression type - * \tparam dimdst target dimension - */ - template - inline ReshapeExp< SrcExp,dimdst, ExpInfo::kDim > reshape(const Exp &src, Shape oshape){ - return ReshapeExp< SrcExp,dimdst, ExpInfo::kDim >(src.self(), oshape); - } - - /*! - * \brief a expression that reshapes a tensor to another shape - * \param src Tensor: - * \return a expresion with type Tensor - * \tparam a1 smaller dimension to be swapped - * \tparam a2 larger dimension to be swapped - * \tparam SrcExp source expression - * \tparam etype source expression type - */ - template - inline SwapAxisExp< SrcExp, ExpInfo::kDim, a1,a2> swapaxis(const Exp &src){ - typedef ExpInfo Info; - TypeCheckPass< Info::kDim>=a1+1 && Info::kDim >= a2+1 && a1+1 <= a2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return SwapAxisExp< SrcExp,Info::kDim,a1,a2>(src.self()); - } - - /*! - * \brief a sum over all dimensions, except dimkeep - * \param exp input expression that must be a matrix Tensor - * \return a expresion with type Tensor - * \tparam dimkeep the dimension that will be kept - * \tparam SrcExp expression - * \tparam etype type of expression - */ - template - inline ReduceTo1DExp sumall_except_dim(const Exp &exp){ - return ReduceTo1DExp(exp.self(), 1.0f); - } /*! 
* \brief pooling subregion results together @@ -556,17 +335,6 @@ namespace mshadow{ return ChannelPoolingExp::kDim >(src.self(),nsize); } - /*! - * \brief a expression that sum over rows of a matrix - * \param exp input expression that must be a matrix Tensor - * \return a expresion with type Tensor - * \tparam SrcExp expression - * \tparam etype type of expression - */ - template - inline ReduceTo1DExp sum_rows(const Exp &exp){ - return sumall_except_dim<0>(exp); - } }; // namespace expr }; // namespace mshadow @@ -577,20 +345,6 @@ namespace mshadow{ // -------------------------------------------------- namespace mshadow{ namespace expr{ - template - struct ExpComplexEngine< SV, Device, 1, ReduceTo1DExp >{ - inline static void Eval(Tensor &dst, const ReduceTo1DExp &exp){ - TypeCheckPass< dimkeep!=0 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - MapReduceKeepHighDim(dst, exp.src_, exp.scale_); - } - }; - - template - struct ExpComplexEngine< SV, Device, 1, ReduceTo1DExp >{ - inline static void Eval(Tensor &dst, const ReduceTo1DExp &exp){ - MapReduceKeepLowest(dst, exp.src_, exp.scale_); - } - }; }; // namespace expr namespace expr{ @@ -598,150 +352,14 @@ namespace mshadow{ }; // namespace expr namespace expr{ - template - struct Plan< UnpackPatchToColXExp >{ - public: - Plan(const UnpackPatchToColXExp &e) - :src_(MakePlan(e.img_)), psize_y_(e.psize_y_), psize_x_(e.psize_x_), pstride_(e.pstride_), - i_channel_(e.i_channel_), i_height_(e.i_height_), i_width_(e.i_width_), - o_height_((i_height_ - psize_y_) / pstride_ + 1), - o_width_ ((i_width_ - psize_x_) / pstride_ + 1){ - } - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const{ - const index_t x_offset = i % psize_x_; - const index_t idivp = i / psize_x_; - const index_t y_offset = idivp % psize_y_; - const index_t c = idivp / psize_y_; - const index_t x = (j % o_width_) * pstride_ + x_offset; - const index_t jdivw = j / o_width_; - const index_t y = (jdivw % o_height_) * pstride_ + y_offset; - const index_t n = jdivw / o_height_; - - if(x < i_width_ && y < i_height_){ - return src_.Eval((n * i_channel_ + c) * i_height_ + y, x); - }else{ - return 0.0f; - } - } - private: - Plan src_; - const index_t psize_y_, psize_x_, pstride_, i_channel_, i_height_, i_width_, o_height_, o_width_; - }; - template - struct Plan< PackColToPatchXExp >{ - public: - Plan(const PackColToPatchXExp &e) - :mat_(e.mat_), psize_y_(e.psize_y_), psize_x_(e.psize_x_), pstride_(e.pstride_), - i_channel_(e.shape_[2]), i_height_(e.shape_[1]), - o_height_((e.shape_[1] - psize_y_) / pstride_ + 1), - o_width_((e.shape_[0] - psize_x_) / pstride_ + 1){ - // note: i/o convention are same as unpack - } - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const{ - using namespace std; - const index_t y = i % i_height_; - const index_t idivh = i / i_height_; - const index_t c = idivh % i_channel_; - const index_t n = idivh / i_channel_; - const index_t x = j; - const index_t py_min = y < psize_y_ ? 0 : (y-psize_y_+pstride_)/pstride_; - const index_t px_min = x < psize_x_ ? 
0 : (x-psize_x_+pstride_)/pstride_; - const index_t py_max = min((y+pstride_)/pstride_, o_height_); - const index_t px_max = min((x+pstride_)/pstride_, o_width_); - real_t res = 0.0f; - for(index_t py = py_min; py < py_max; ++py){ - for(index_t px = px_min; px < px_max; ++px){ - res += mat_[ (c * psize_y_ + y - py*pstride_) * psize_x_ + x - px*pstride_ ][ (n * o_height_ + py) * o_width_+px ]; - } - } - return res; - } - private: - Tensor mat_; - const index_t psize_y_, psize_x_, pstride_, i_channel_, i_height_, o_height_, o_width_; - }; - }; + }; namespace expr{ - template - struct Plan< ReshapeExp >{ - public: - Plan(const ReshapeExp &e) - : src_(MakePlan(e.src_)), oshape0_(e.shape_[0]), ishape0_(e.ishape0_){ - } - MSHADOW_XINLINE real_t Eval(index_t y, index_t x) const{ - const index_t idx = y * oshape0_ + x; - return src_.Eval(idx / ishape0_, idx % ishape0_); - } - private: - Plan src_; - const index_t oshape0_, ishape0_; - }; - // special work plan for 1 dimensional data - template - struct Plan< ReshapeExp >{ - public: - Plan(const ReshapeExp &e) - : src_(MakePlan(e.src_)), oshape0_(e.shape_[0]){ - } - MSHADOW_XINLINE real_t Eval(index_t y, index_t x) const{ - return src_.Eval(0, y * oshape0_ + x); - } - private: - Plan src_; - const index_t oshape0_; - }; }; namespace expr{ - template - struct Plan< SwapAxisExp >{ - public: - Plan(const SwapAxisExp &e) - : src_(MakePlan(e.src_)), - shape1_(e.shape_.ProdShape(1, a1)), - shape2_(e.shape_[a1]), - shape3_(e.shape_.ProdShape(a1+1, a2)), - shape4_(e.shape_[a2]){ - } - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const{ - const index_t y = i % shape1_; - i /= shape1_; - const index_t z = i % shape2_; - i /= shape2_; - const index_t c = i % shape3_; - i /= shape3_; - const index_t n = i % shape4_; - // swap z and n - return src_.Eval(((((i/shape4_)*shape2_ + z) * shape3_+c) * shape4_ + n) * shape1_ + y, j); - } - private: - Plan src_; - const index_t shape1_, shape2_, shape3_, shape4_; - }; - template - struct Plan< SwapAxisExp >{ - public: - Plan(const SwapAxisExp &e) - : src_(MakePlan(e.src_)), - shape0_(e.shape_[0]), - shape1_(e.shape_.ProdShape(1,a2)), - shape2_(e.shape_[a2]){ - } - MSHADOW_XINLINE real_t Eval(index_t i, index_t x) const{ - // swap x and z - const index_t y = i % shape1_; - i /= shape1_; - const index_t z = i % shape2_; - const index_t n = i / shape2_; - return src_.Eval( (n*shape0_ + x) * shape1_ + y , z); - } - private: - Plan src_; - const index_t shape0_, shape1_, shape2_; - }; }; namespace expr{ From 687b4a62c3e9845fd582b5b8a8707b22d4b87aa3 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 26 Dec 2014 17:19:12 -0800 Subject: [PATCH 028/147] add pooling --- mshadow/extension/pooling.h | 138 +++++++++++++++++++++++++++++++++ mshadow/extension/reduceto1d.h | 15 ++-- 2 files changed, 148 insertions(+), 5 deletions(-) create mode 100644 mshadow/extension/pooling.h diff --git a/mshadow/extension/pooling.h b/mshadow/extension/pooling.h new file mode 100644 index 000000000000..f1ce146d53aa --- /dev/null +++ b/mshadow/extension/pooling.h @@ -0,0 +1,138 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file pooling.h + * \brief support for pooling + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_POOLING_H_ +#define MSHADOW_EXTENSION_POOLING_H_ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! 
+ * \brief pooling expression, do reduction over local patches of an image
+ * \tparam Reducer reduction method during pooling
+ * \tparam SrcExp source expression to be pooled from
+ * \tparam DType the content data type
+ * \tparam srcdim dimension of src
+ */
+template<typename Reducer, typename SrcExp, typename DType, int srcdim>
+struct PoolingExp:
+      public MakeTensorExp<PoolingExp<Reducer, SrcExp, DType, srcdim>,
+                           SrcExp, srcdim, DType> {
+  /*! \brief source operand */
+  const SrcExp &src_;
+  /*! \brief kernel size in height */
+  index_t ksize_y_;
+  /*! \brief kernel size in width */
+  index_t ksize_x_;
+  /*! \brief kernel stride */
+  index_t kstride_;
+  /*! \brief source height shape[srcdim - 2] */
+  index_t src_height_;
+  /*! \brief source width shape[srcdim - 1] */
+  index_t src_width_;
+  /*! \brief constructor */
+  PoolingExp(const SrcExp &src, index_t ksize_y, index_t ksize_x, index_t kstride)
+      : src_(src), ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) {
+    Shape<srcdim> sshape = ShapeCheck<srcdim, SrcExp>::Check(src_);
+    utils::Check(sshape[srcdim - 1] >= ksize_x && sshape[srcdim - 2] >= ksize_y,
+                 "PoolingExp: kernel must be smaller than image");
+    this->src_height_ = sshape[srcdim - 2];
+    this->src_width_ = sshape[srcdim - 1];
+    this->shape_ = sshape;
+    this->shape_[srcdim - 2] = (src_height_ - ksize_y) / kstride + 1;
+    this->shape_[srcdim - 1] = (src_width_ - ksize_x) / kstride + 1;
+  }
+  /*! \brief constructor, specify shape */
+  PoolingExp(const SrcExp &src, Shape<2> pshape, index_t ksize_y, index_t ksize_x, index_t kstride)
+      : src_(src), ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) {
+    Shape<srcdim> sshape = ShapeCheck<srcdim, SrcExp>::Check(src_);
+    utils::Assert(sshape[srcdim - 1] >= ksize_x && sshape[srcdim - 2] >= ksize_y,
+                  "PoolingExp: kernel must be smaller than image");
+    this->src_height_ = sshape[srcdim - 2];
+    this->src_width_ = sshape[srcdim - 1];
+    this->shape_ = sshape;
+    this->shape_[srcdim - 2] = pshape[0];
+    this->shape_[srcdim - 1] = pshape[1];
+  }
+};
+/*!
+ * \brief pooling subregion results together
+ * \param src source image, shape: (batch, channel, height, width)
+ * \param ksize_y kernel size in height
+ * \param ksize_x kernel size in width
+ * \param kstride stride for each kernel
+ * \return expression of pooled result
+ * \tparam Reducer reducer type
+ * \tparam SrcExp source expression
+ * \tparam DType the content data type
+ * \tparam etype type of expression
+ */
+template<typename Reducer, typename SrcExp, typename DType, int etype>
+inline PoolingExp<Reducer, SrcExp, DType, ExpInfo<SrcExp>::kDim>
+pool(const Exp<SrcExp, DType, etype> &src, index_t ksize_y, index_t ksize_x, index_t kstride) {
+  TypeCheckPass<ExpInfo<SrcExp>::kDim >= 2>
+      ::Error_Expression_Does_Not_Meet_Dimension_Req();
+  return PoolingExp<Reducer, SrcExp, DType, ExpInfo<SrcExp>::kDim>
+      (src.self(), ksize_y, ksize_x, kstride);
+}
+/*!
+ * \brief same as pool, except the output shape is specified by pshape
+ * \param src source image
+ * \param pshape output shape
+ * \param ksize_y kernel size in y
+ * \param ksize_x kernel size in x
+ * \param kstride stride for each kernel
+ * \return expression of pooled result
+ * \tparam Reducer reducer type
+ * \tparam SrcExp source expression
+ * \tparam DType the content data type
+ * \tparam etype type of expression
+ */
+template
+inline PoolingExp::kDim>
+pool(const Exp &src, Shape<2> pshape, index_t ksize_y, index_t ksize_x, index_t kstride) {
+  TypeCheckPass::kDim >= 2>
+      ::Error_Expression_Does_Not_Meet_Dimension_Req();
+  return PoolingExp::kDim>
+      (src.self(), pshape, ksize_y, ksize_x, kstride);
+}
+//----------------------
+// Execution plan
+//----------------------
+template
+struct Plan, DType> {
+ public:
+  explicit Plan(const PoolingExp &e)
+      : src_(MakePlan(e.src_)),
+        ksize_y_(e.ksize_y_), ksize_x_(e.ksize_x_), kstride_(e.kstride_),
+        src_height_(e.src_height_),src_width_(e.src_width_),
+        new_height_(e.shape_[srcdim - 2]) {}
+  MSHADOW_XINLINE DType Eval(index_t i, index_t j) const {
+    using std::min;
+    const index_t py = i % new_height_;
+    const index_t y_start = py * kstride_;
+    const index_t y_end = min(y_start + ksize_y_, src_height_);
+    const index_t px = j;
+    const index_t x_start = px * kstride_;
+    const index_t x_end = min(x_start + ksize_x_, src_width_);
+    const index_t c = i / new_height_;
+
+    DType res; Reducer::SetInitValue(res);
+    for (index_t y = y_start; y < y_end; ++y) {
+      for (index_t x = x_start; x < x_end; ++x) {
+        Reducer::Reduce(res, src_.Eval(c * src_height_ + y, x));
+      }
+    }
+    return res;
+  }
+ private:
+  Plan src_;
+  const index_t ksize_y_, ksize_x_, kstride_;
+  const index_t src_height_, src_width_;
+  const index_t new_height_;
+};
+} // namespace expr
+} // namespace mshadow
+#endif // MSHADOW_EXTENSION_POOLING_H_
diff --git a/mshadow/extension/reduceto1d.h b/mshadow/extension/reduceto1d.h
index bb8cdbbbdaf1..b52157451ccd 100644
--- a/mshadow/extension/reduceto1d.h
+++ b/mshadow/extension/reduceto1d.h
@@ -56,22 +56,27 @@ template
 inline ReduceTo1DExp
 sum_rows(const Exp &exp) {
   TypeCheckPass::kDim ==2>
-    ::Error_Expression_Does_Not_Meet_Dimension_Req();
+      ::Error_Expression_Does_Not_Meet_Dimension_Req();
   return sumall_except_dim<1>(exp);
 }
 template
-struct ExpComplexEngine, DType> {
+struct ExpComplexEngine,
+                        DType> {
   static const int dimkeep = ExpInfo::kDim - m_dimkeep;
   inline static void Eval(Tensor *dst,
-                          const ReduceTo1DExp &exp) {
-    TypeCheckPass::Error_Expression_Does_Not_Meet_Dimension_Req();
+                          const ReduceTo1DExp &exp) {
+    TypeCheckPass
+        ::Error_Expression_Does_Not_Meet_Dimension_Req();
     MapReduceKeepHighDim(dst, exp.src_, exp.scale_);
   }
 };
 template
-struct ExpComplexEngine, DType> {
+struct ExpComplexEngine, DType> {
   inline static void Eval(Tensor *dst,
                           const ReduceTo1DExp &exp) {
     MapReduceKeepLowest(dst, exp.src_, exp.scale_);
From 85ed2dbfab28fe367469985186f1097de8542275 Mon Sep 17 00:00:00 2001
From: tqchen 
Date: Fri, 26 Dec 2014 18:39:21 -0800
Subject: [PATCH 029/147] checkin chpool
---
 mshadow/extension/channel_pool.h              |  88 ++++++++++++
 .../extension/{pooling.h => spatial_pool.h}   |   0
 mshadow/extension/spatial_unpool.h            | 127 ++++++++++++++++++
 3 files changed, 215 insertions(+)
 create mode 100644 mshadow/extension/channel_pool.h
 rename mshadow/extension/{pooling.h => spatial_pool.h} (100%)
 create mode 100644 mshadow/extension/spatial_unpool.h
diff --git a/mshadow/extension/channel_pool.h b/mshadow/extension/channel_pool.h
new 
file mode 100644 index 000000000000..f8c3e46f95eb --- /dev/null +++ b/mshadow/extension/channel_pool.h @@ -0,0 +1,88 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file broadcast.h + * \brief support for broadcast and repmat + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_CHANNEL_POOL_H_ +#define MSHADOW_EXTENSION_CHANNEL_POOL_H_ +#include +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief channel pooling expression, do reduction over (local nearby) channels, + * used to implement local response normalization + * \tparam Reducer reduction method during pooling + * \tparam SrcExp source expression to be pooled from + * \tparam srcdim dimension of src + */ +template +struct ChannelPoolingExp: + public MakeTensorExp, + SrcExp, srcdim, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief neighbor size */ + index_t nsize_; + /*! \brief constructor */ + ChannelPoolingExp(const SrcExp &src, index_t nsize) + : src_(src), nsize_(nsize) { + utils::Check(nsize % 2 == 1, + "chpool: local size must be odd"); + this->shape_ = ShapeCheck::Check(src_); + utils::Check(this->shape_[srcdim - 3] >= nsize_, + "chpool: local size must be smaller than nchannels"); + } +}; +/*! + * \brief channel pooling, do reduction over (local nearby) channels, + * used to implement local response normalization + * \param src source data + * \param nsize neighbor size + * \return expression of pooled result + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam etype type of expression + */ +template +inline ChannelPoolingExp::kDim> +chpool(const Exp &src, index_t nsize) { + TypeCheckPass::kDim >= 3> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return ChannelPoolingExp::kDim>(src.self(), nsize); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const ChannelPoolingExp &e) + : src_(MakePlan(e.src_)), channel_(e.shape_[srcdim - 3]), + height_(e.shape_[srcdim - 2]), width_(e.shape_[srcdim - 1]), + hnsize_(e.nsize_ / 2) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + using std::min; + const index_t y = i % height_; + i /= height_; + const index_t c = i % channel_; + const index_t n = i / channel_; + const index_t x = j; + const index_t cstart = c < hnsize_ ? 0 : c - hnsize_; + const index_t cend = min(c + hnsize_ + 1, channel_); + DType res; Reducer::SetInitValue(res); + for (index_t cc = cstart; cc < cend; ++cc) { + Reducer::Reduce(res, src_.Eval((n * channel_ + cc) * height_ + y, x)); + } + return res; + } + private: + Plan src_; + const index_t channel_, height_, width_, hnsize_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_CHANNEL_POOL_H_ + diff --git a/mshadow/extension/pooling.h b/mshadow/extension/spatial_pool.h similarity index 100% rename from mshadow/extension/pooling.h rename to mshadow/extension/spatial_pool.h diff --git a/mshadow/extension/spatial_unpool.h b/mshadow/extension/spatial_unpool.h new file mode 100644 index 000000000000..a75463c1a9fb --- /dev/null +++ b/mshadow/extension/spatial_unpool.h @@ -0,0 +1,127 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file broadcast.h + * \brief support for broadcast and repmat + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_SPATIAL_UNPOOL_H_ +#define MSHADOW_EXTENSION_SPATIAL_UNPOOL_H_ +#include +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! 
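+ * Editor's note: a hedged usage sketch for chpool defined in the file above
+ * (the tensor in is an illustrative assumption; red::sum is the summation
+ * reducer from tensor_base.h):
+ *
+ *   TensorContainer<cpu, 4, float> in(Shape4(1, 8, 24, 24)), out(in.shape_);
+ *   out = chpool<red::sum>(in, 5);  // reduce over 5 neighbouring channels
+ */
+/*!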
+ * \brief unpooling expr reverse operation of pooling, used to pass gradient back
+ * \tparam Reducer specifies reduction operation during pooling
+ * \tparam Device which device it lies
+ */
+template
+struct UnPoolingExp:
+      public MakeTensorExp,
+                          SrcExp, srcdim, DType> {
+  /*! \brief source input, corresponds to src in pooling */
+  const SrcExp &data_src_;
+  /*! \brief result of pooled data, corresponds to result of pooling */
+  const SrcExp &data_pooled_;
+  /*! \brief gradient data of pooled part, to be propagated down */
+  const SrcExp &grad_pooled_;
+  /*! \brief height of pooled shape */
+  index_t pshape_y_;
+  /*! \brief width of pooled shape */
+  index_t pshape_x_;
+  /*! \brief kernel size in height */
+  index_t ksize_y_;
+  /*! \brief kernel size in width */
+  index_t ksize_x_;
+  /*! \brief kernel stride */
+  index_t kstride_;
+  /*! \brief constructor */
+  UnPoolingExp(const SrcExp &data_src,
+               const SrcExp &data_pooled,
+               const SrcExp &grad_pooled,
+               index_t ksize_y, index_t ksize_x, index_t kstride)
+      : data_src_(data_src), data_pooled_(data_pooled),
+        grad_pooled_(grad_pooled),
+        ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) {
+    Shape pshape = ShapeCheck::Check(grad_pooled);
+    utils::Check(pshape == ShapeCheck::Check(data_pooled),
+                 "UnPoolingExp: pooled shape mismatch");
+    Shape sshape = ShapeCheck::Check(data_src);
+    for (int k = 0; k < srcdim - 2; ++k) {
+      utils::Check(pshape[k] == sshape[k],
+                   "UnPoolingExp: pool and src shape mismatch");
+    }
+    pshape_x_ = pshape[srcdim - 1];
+    pshape_y_ = pshape[srcdim - 2];
+    this->shape_ = sshape;
+  }
+};
+/*!
+ * \brief unpooling gradient for 4D, backprop gradient value back, reverse operation of pooling,
+ *   same as unpooling, but allows unequal size of kernel
+ * \param data_src source input, corresponds to src in pooling
+ * \param data_pooled result of pooled data, corresponds to result of pooling
+ * \param grad_pooled gradient data of pooled part, to be propagated down
+ * \param ksize_y kernel height
+ * \param ksize_x kernel width
+ * \param kstride stride for each kernel
+ * \return expression corresponding to unpooled 4D Tensor, storing backpropagated gradient
+ * \tparam Reducer reducer type
+ * \tparam SrcExp source expression
+ * \tparam DType the content data type
+ * \tparam etype type of expression
+ */
+template
+inline UnPoolingExp::kDim>
+unpool(const Exp &data_src,
+       const Exp &data_pooled,
+       const Exp &grad_pooled,
+       index_t ksize_y, index_t ksize_x, index_t kstride) {
+  return UnPoolingExp::kDim>
+      (data_src.self(), data_pooled.self(), grad_pooled.self(),
+       ksize_y, ksize_x, kstride);
+}
+//----------------------
+// Execution plan
+//----------------------
+template
+struct Plan, DType> {
+ public:
+  explicit Plan(const UnPoolingExp &e)
+      : data_src_(e.data_src_), data_pooled_(e.data_pooled_),
+        grad_pooled_(e.grad_pooled_), sshape_y_(e.shape_[srcdim - 2]),
+        pshape_y_(e.pshape_y_), pshape_x_(e.pshape_x_),
+        ksize_y_(e.ksize_y_), ksize_x_(e.ksize_x_), kstride_(e.kstride_) {}
+  MSHADOW_XINLINE DType Eval(index_t i, index_t j) const {
+    using std::min;
+    const index_t x = j;
+    const index_t y = i % sshape_y_;
+    const index_t c = i / sshape_y_;
+    const DType vsrc = data_src_.Eval(i, j);
+    const index_t py_min =
+        y < ksize_y_ ? 0 : (y - ksize_y_ + kstride_) / kstride_;
+    const index_t px_min =
+        x < ksize_x_ ? 
0 : (x - ksize_x_ + kstride_) / kstride_; + const index_t py_max = min((y + kstride_) / kstride_, pshape_y_); + const index_t px_max = min((x + kstride_) / kstride_, pshape_x_); + + DType val = 0; + for (index_t py = py_min; py < py_max; ++py) { + for (index_t px = px_min; px < px_max; ++px) { + val += Reducer::PartialGrad(vsrc, + data_pooled_.Eval(c * pshape_y_ + py, px) * + grad_pooled_.Eval(c * pshape_y_ + py, px)); + } + } + return val; + } + + private: + Plan data_src_, data_pooled_, grad_pooled_; + const index_t sshape_y_, pshape_y_, pshape_x_; + const index_t ksize_y_, ksize_x_; + const index_t kstride_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_SPATIAL_UNPOOL_H_ From 2e0adf3bf0e0aa13843bab392c7e56184f1c361f Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 26 Dec 2014 18:39:22 -0800 Subject: [PATCH 030/147] checkin chpool --- mshadow/extension/spatial_pool.h | 42 ++++++++++++++++++------------ mshadow/extension/spatial_unpool.h | 2 +- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/mshadow/extension/spatial_pool.h b/mshadow/extension/spatial_pool.h index f1ce146d53aa..b87c03f6765d 100644 --- a/mshadow/extension/spatial_pool.h +++ b/mshadow/extension/spatial_pool.h @@ -1,11 +1,12 @@ /*! * Copyright (c) 2014 by Contributors - * \file pooling.h - * \brief support for pooling + * \file spatial_pool.h + * \brief support for spatial pooling * \author Tianqi Chen */ -#ifndef MSHADOW_EXTENSION_POOLING_H_ -#define MSHADOW_EXTENSION_POOLING_H_ +#ifndef MSHADOW_EXTENSION_SPATIAL_POOL_H_ +#define MSHADOW_EXTENSION_SPATIAL_POOL_H_ +#include #include "../extension.h" namespace mshadow { namespace expr { @@ -33,7 +34,8 @@ struct PoolingExp: /*! \brief source width shape[0] */ index_t src_width_; /*! \brief constructor */ - PoolingExp(const SrcExp &src, index_t ksize_y, index_t ksize_x, index_t kstride) + PoolingExp(const SrcExp &src, + index_t ksize_y, index_t ksize_x, index_t kstride) : src_(src), ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { Shape sshape = ShapeCheck::Check(src_); utils::Check(sshape[srcdim - 1] >= ksize_x && sshape[srcdim - 2] >= ksize_y, @@ -41,21 +43,23 @@ struct PoolingExp: this->src_height_ = sshape[srcdim - 2]; this->src_width_ = sshape[srcdim - 1]; this->shape_ = sshape; - this->shape_[srcdim - 2] = (src_height_ - ksize_y) / kstride + 1; + this->shape_[srcdim - 2] = (src_height_ - ksize_y) / kstride + 1; this->shape_[srcdim - 1] = (src_width_ - ksize_x) / kstride + 1; } /*! \brief constructor, specify shape */ - PoolingExp(const SrcExp &src, Shape<2> pshape, index_t ksize_y, index_t ksize_x, index_t kstride) + PoolingExp(const SrcExp &src, Shape<2> pshape, + index_t ksize_y, index_t ksize_x, index_t kstride) : src_(src), ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { Shape sshape = ShapeCheck::Check(src_); - utils::Assert(sshape[srcdim - 1] >= ksize_x && sshape[srcdim - 2] >= ksize_y, + utils::Assert(sshape[srcdim - 1] >= ksize_x && + sshape[srcdim - 2] >= ksize_y, "PoolingExp: kernel must be smaller than image"); this->src_height_ = sshape[srcdim - 2]; this->src_width_ = sshape[srcdim - 1]; this->shape_ = sshape; this->shape_[srcdim - 2] = pshape[0]; this->shape_[srcdim - 1] = pshape[1]; - } + } }; /*! 
* \brief pooling subregion results together @@ -71,7 +75,8 @@ struct PoolingExp: */ template inline PoolingExp::kDim> -pool(const Exp &src, index_t ksize_y, index_t ksize_x, index_t kstride) { +pool(const Exp &src, + index_t ksize_y, index_t ksize_x, index_t kstride) { TypeCheckPass::kDim >= 2> ::Error_Expression_Does_Not_Meet_Dimension_Req(); return PoolingExp::kDim> @@ -90,9 +95,11 @@ pool(const Exp &src, index_t ksize_y, index_t ksize_x, ind * \tparam DType the content data type * \tparam etype type of expression */ -template -inline PoolingExp::kDim> -pool(const Exp &src, Shape<2> pshape, index_t ksize_y, index_t ksize_x, index_t kstride) { +template +inline PoolingExp::kDim> +pool(const Exp &src, Shape<2> pshape, + index_t ksize_y, index_t ksize_x, index_t kstride) { TypeCheckPass::kDim >= 2> ::Error_Expression_Does_Not_Meet_Dimension_Req(); return PoolingExp::kDim> @@ -105,9 +112,9 @@ template struct Plan, DType> { public: explicit Plan(const PoolingExp &e) - : src_(MakePlan(e.src_)), + : src_(MakePlan(e.src_)), ksize_y_(e.ksize_y_), ksize_x_(e.ksize_x_), kstride_(e.kstride_), - src_height_(e.src_height_),src_width_(e.src_width_), + src_height_(e.src_height_), src_width_(e.src_width_), new_height_(e.shape_[srcdim - 2]) {} MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { using std::min; @@ -118,7 +125,7 @@ struct Plan, DType> { const index_t x_start = px * kstride_; const index_t x_end = min(x_start + ksize_x_, src_width_); const index_t c = i / new_height_; - + DType res; Reducer::SetInitValue(res); for (index_t y = y_start; y < y_end; ++y) { for (index_t x = x_start; x < x_end; ++x) { @@ -127,6 +134,7 @@ struct Plan, DType> { } return res; } + private: Plan src_; const index_t ksize_y_, ksize_x_, kstride_; @@ -135,4 +143,4 @@ struct Plan, DType> { }; } // namespace expr } // namespace mshadow -#endif // MSHADOW_EXTENSION_POOLING_H_ +#endif // MSHADOW_EXTENSION_SPATIAL_POOL_H_ diff --git a/mshadow/extension/spatial_unpool.h b/mshadow/extension/spatial_unpool.h index a75463c1a9fb..eac3faec713a 100644 --- a/mshadow/extension/spatial_unpool.h +++ b/mshadow/extension/spatial_unpool.h @@ -105,7 +105,7 @@ struct Plan, DType> { const index_t py_max = min((y + kstride_) / kstride_, pshape_y_); const index_t px_max = min((x + kstride_) / kstride_, pshape_x_); - DType val = 0; + DType val = static_cast(0); for (index_t py = py_min; py < py_max; ++py) { for (index_t px = px_min; px < px_max; ++px) { val += Reducer::PartialGrad(vsrc, From a3a796ecb992058dab1027f1743842bd82577232 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 26 Dec 2014 18:50:19 -0800 Subject: [PATCH 031/147] add pad --- mshadow/extension.h | 4 + mshadow/extension/broadcast.h | 1 - mshadow/extension/channel_pool.h | 6 +- mshadow/extension/pad.h | 111 ++++++++++ mshadow/extension/reduceto1d.h | 4 +- mshadow/extension/spatial_unpool.h | 10 +- mshadow/tensor.h | 15 ++ mshadow/tensor_expr_ext.h | 338 +---------------------------- 8 files changed, 144 insertions(+), 345 deletions(-) create mode 100644 mshadow/extension/pad.h diff --git a/mshadow/extension.h b/mshadow/extension.h index 0e9089ccc6ba..25092c6b2567 100644 --- a/mshadow/extension.h +++ b/mshadow/extension.h @@ -14,5 +14,9 @@ #include "./extension/reshape.h" #include "./extension/swapaxis.h" #include "./extension/reduceto1d.h" +#include "./extension/spatial_pool.h" +#include "./extension/spatial_unpool.h" +#include "./extension/channel_pool.h" +#include "./extension/pad.h" #endif diff --git a/mshadow/extension/broadcast.h b/mshadow/extension/broadcast.h index 
9ff3bb88b0d7..a163a5011212 100644
--- a/mshadow/extension/broadcast.h
+++ b/mshadow/extension/broadcast.h
@@ -69,7 +69,6 @@ repmat(const expr::Exp &src, index_t nrow) {
 //----------------------
 // Execution plan
 //----------------------
-/*! \brief execution plan of Broadcast1DExp */
 template
 struct Plan, DType> {
diff --git a/mshadow/extension/channel_pool.h b/mshadow/extension/channel_pool.h
index f8c3e46f95eb..6b0b32553332 100644
--- a/mshadow/extension/channel_pool.h
+++ b/mshadow/extension/channel_pool.h
@@ -1,7 +1,7 @@
 /*!
  * Copyright (c) 2014 by Contributors
- * \file broadcast.h
- * \brief support for broadcast and repmat
+ * \file channel_pool.h
+ * \brief support for chpool
  * \author Tianqi Chen
  */
 #ifndef MSHADOW_EXTENSION_CHANNEL_POOL_H_
@@ -15,6 +15,7 @@ namespace expr {
  * used to implement local response normalization
  * \tparam Reducer reduction method during pooling
  * \tparam SrcExp source expression to be pooled from
+ * \tparam DType the type of elements
  * \tparam srcdim dimension of src
  */
 template
@@ -43,6 +44,7 @@ struct ChannelPoolingExp:
 * \return expression of pooled result
  * \tparam Reducer reducer type
  * \tparam SrcExp source expression
+ * \tparam DType the type of elements
 * \tparam etype type of expression
  */
 template
diff --git a/mshadow/extension/pad.h b/mshadow/extension/pad.h
new file mode 100644
index 000000000000..6622a022acc8
--- /dev/null
+++ b/mshadow/extension/pad.h
@@ -0,0 +1,111 @@
+/*!
+ * Copyright (c) 2014 by Contributors
+ * \file pad.h
+ * \brief support for pad
+ * \author Tianqi Chen
+ */
+#ifndef MSHADOW_EXTENSION_PAD_H_
+#define MSHADOW_EXTENSION_PAD_H_
+#include "../extension.h"
+namespace mshadow {
+namespace expr {
+/*!
+ * \brief padding expression, pad an image with zeros
+ * \tparam SrcExp source expression
+ * \tparam DType the type of elements
+ * \tparam srcdim dimension of src
+ */
+template
+struct PaddingExp:
+      public MakeTensorExp,
+                          SrcExp, srcdim, DType> {
+  /*! \brief source operand */
+  const SrcExp &src_;
+  /*! \brief pad size in y */
+  index_t pad_y_;
+  /*! \brief pad size in x */
+  index_t pad_x_;
+  /*! \brief source tensor height */
+  index_t src_height_;
+  /*! \brief source tensor width */
+  index_t src_width_;
+  /*! \brief constructor */
+  PaddingExp(const SrcExp &src, index_t pad_y, index_t pad_x)
+      : src_(src), pad_y_(pad_y), pad_x_(pad_x) {
+    this->shape_ = ShapeCheck::Check(src_);
+    src_height_ = this->shape_[srcdim - 2];
+    src_width_ = this->shape_[srcdim - 1];
+    this->shape_[srcdim - 2] += pad_y * 2;  // height
+    this->shape_[srcdim - 1] += pad_x * 2;  // width
+  }
+};
+/*!
+ * \brief padding expression, pad an image with zeros on boundaries, padding affects shape[0], and shape[1]
+ * \param src original image batches
+ * \param pad padding size
+ * \return expression corresponding to padded result
+ * \tparam SrcExp source expression
+ * \tparam DType the content data type
+ * \tparam etype type of expression
+ */
+template
+inline PaddingExp::kDim>
+pad(const Exp &src, index_t pad) {
+  TypeCheckPass::kDim >= 2>
+      ::Error_Expression_Does_Not_Meet_Dimension_Req();
+  return PaddingExp::kDim>(src.self(), pad, pad);
+}
+/*!
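+ * Editor's note: a small sketch of the symmetric overload above (img and
+ * padded are assumed 4D float tensors, 32x32 spatially before padding):
+ *
+ *   padded = pad(img, 2);  // zero borders on both axes: 32x32 -> 36x36
+ */
+/*!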
+ * \brief padding expression, pad an image with zeros on boundaries, padding affects shape[0], and shape[1]
+ * \param src original image batches
+ * \param pad_y padding size in y
+ * \param pad_x padding size in x
+ * \return expression corresponding to padded result
+ * \tparam SrcExp source expression
+ * \tparam DType the content data type
+ * \tparam etype type of expression
+ */
+template
+inline PaddingExp::kDim>
+pad(const Exp &src, index_t pad_y, index_t pad_x) {
+  TypeCheckPass::kDim >= 2>
+      ::Error_Expression_Does_Not_Meet_Dimension_Req();
+  return PaddingExp::kDim>
+      (src.self(), pad_y, pad_x);
+}
+//----------------------
+// Execution plan
+//----------------------
+template
+struct Plan, DType> {
+ public:
+  explicit Plan(const PaddingExp &e)
+      : src_(MakePlan(e.src_)),
+        pad_y_(e.pad_y_), pad_x_(e.pad_x_),
+        new_height_(e.shape_[srcdim - 2]),
+        src_height_(e.src_height_), src_width_(e.src_width_) {}
+  MSHADOW_XINLINE DType Eval(index_t i, index_t j) const {
+    const index_t x = j;
+    const index_t y = i % new_height_;
+    const index_t c = i / new_height_;
+    if (y < pad_y_ || x < pad_x_) return static_cast(0);
+    const index_t h = y - pad_y_;
+    const index_t w = x - pad_x_;
+    if (h < src_height_ && w < src_width_) {
+      return src_.Eval(c * src_height_ + h, w);
+    } else {
+      return static_cast(0);
+    }
+  }
+
+ private:
+  Plan src_;
+  const index_t pad_y_;
+  const index_t pad_x_;
+  const index_t new_height_;
+  const index_t src_height_;
+  const index_t src_width_;
+};
+}  // namespace expr
+}  // namespace mshadow
+#endif  // MSHADOW_EXTENSION_PAD_H_
diff --git a/mshadow/extension/reduceto1d.h b/mshadow/extension/reduceto1d.h
index b52157451ccd..cfab77cb6659 100644
--- a/mshadow/extension/reduceto1d.h
+++ b/mshadow/extension/reduceto1d.h
@@ -1,7 +1,7 @@
 /*!
  * Copyright (c) 2014 by Contributors
- * \file broadcast.h
- * \brief support for broadcast and repmat
+ * \file reduceto1d.h
+ * \brief support for sum_rows and sumall_except_dim
  * \author Tianqi Chen
  */
 #ifndef MSHADOW_EXTENSION_REDUCETO1D_H_
diff --git a/mshadow/extension/spatial_unpool.h b/mshadow/extension/spatial_unpool.h
index eac3faec713a..092819c32697 100644
--- a/mshadow/extension/spatial_unpool.h
+++ b/mshadow/extension/spatial_unpool.h
@@ -1,7 +1,7 @@
 /*!
  * Copyright (c) 2014 by Contributors
- * \file broadcast.h
- * \brief support for broadcast and repmat
+ * \file spatial_unpool.h
+ * \brief support for unpool
  * \author Tianqi Chen
 */
@@ -12,8 +12,10 @@ namespace mshadow {
 namespace expr {
 /*!
  * \brief unpooling expr reverse operation of pooling, used to pass gradient back
- * \tparam Reducer specifies reduction operation during pooling
- * \tparam Device which device it lies
+ * \tparam Reducer reduction method during pooling
+ * \tparam SrcExp source expression to be pooled from
+ * \tparam DType the content data type
+ * \tparam srcdim dimension of src
 */
 template
 struct UnPoolingExp:
diff --git a/mshadow/tensor.h b/mshadow/tensor.h
index fa9ebf4fb36e..8ed2baf41f18 100644
--- a/mshadow/tensor.h
+++ b/mshadow/tensor.h
@@ -127,6 +127,21 @@ v
 * \return subshape
     }
     return s;
   }
+  /*!
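+   * Editor's note: a sketch of the Slice helper documented just below
+   * (the shape values are illustrative):
+   *
+   *   Shape<4> s = Shape4(2, 3, 24, 24);
+   *   Shape<2> hw = s.Slice<2, 4>();  // keeps dims [2, 4): (24, 24)
+   */
+  /*!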
+ * \brief slice the shape from start to end + * \tparam dimstart start dimension + * \tparam dimend end dimension + * \return the sliced shape + */ + template + MSHADOW_XINLINE Shape Slice(void) const { + Shape s; + #pragma unroll + for (int i = dimstart; i < dimend; ++i) { + s[i - dimstart] = this->shape_[i]; + } + return s; + } }; // Shape //------------------------------------------------ // useful construction functions to generate shape diff --git a/mshadow/tensor_expr_ext.h b/mshadow/tensor_expr_ext.h index b47fced65afa..f4fafd9142fa 100644 --- a/mshadow/tensor_expr_ext.h +++ b/mshadow/tensor_expr_ext.h @@ -11,109 +11,6 @@ namespace mshadow{ namespace expr{ - /*! - * \brief pooling expression, do reduction over local patches of a image - * \tparam Reducer reduction method during pooling - * \tparam SrcExp source expression to be pooled from - * \tparam srcdim dimension of src - */ - template - struct PoolingExp: public MakeTensorExp< PoolingExp, SrcExp, srcdim> { - /*! \brief source operand */ - const SrcExp& src_; - /*! \brief kernel size in height */ - index_t ksize_y_; - /*! \brief kernel size in width */ - index_t ksize_x_; - /*! \brief kernel stride */ - index_t kstride_; - /*! \brief source height shape[1] */ - index_t src_height_; - /*! \brief source width shape[0] */ - index_t src_width_; - /*! \brief constructor */ - PoolingExp(const SrcExp &src, index_t ksize_y, index_t ksize_x, index_t kstride) - : src_(src), ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { - Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check(src_); - utils::Assert(sshape[0] >= ksize_x && sshape[1] >= ksize_y, "pool: kernel must be smaller than image"); - this->src_height_ = sshape[1]; - this->src_width_ = sshape[0]; - this->shape_ = sshape; - this->shape_[1] = (src_height_ - ksize_y) / kstride + 1; - this->shape_[0] = (src_width_ - ksize_x) / kstride + 1; - } - /*! \brief constructor, specify shape */ - PoolingExp(const SrcExp &src, Shape<2> pshape, index_t ksize_y, index_t ksize_x, index_t kstride) - : src_(src), ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { - Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check(src_); - utils::Assert(sshape[0] >= ksize_x && sshape[1] >= ksize_y, "pool: kernel must be smaller than image"); - this->src_height_ = sshape[1]; - this->src_width_ = sshape[0]; - this->shape_ = sshape; - this->shape_[1] = pshape[1]; - this->shape_[0] = pshape[0]; - } - }; - - /*! - * \brief unpooling expr reverse operation of pooling, used to pass gradient back - * \tparam Reducer specifies reduction operation during pooling - * \tparam Device which device it lies - */ - template - struct UnPoolingExp: public MakeTensorExp< UnPoolingExp, Tensor, 4> { - /*! \brief source input, corresponds to src in pooling */ - const Tensor& data_src_; - /*! \brief result of pooled data, corresponds to result of pooling */ - const Tensor& data_pooled_; - /*! \brief gradient data of pooled part, to be propgate down */ - const Tensor& grad_pooled_; - /*! \brief kernel size in height */ - index_t ksize_y_; - /*! \brief kernel size in width */ - index_t ksize_x_; - /*! \brief kernel stride */ - index_t kstride_; - /*! 
\brief constructor */ - UnPoolingExp(const Tensor &data_src, const Tensor &data_pooled, - const Tensor &grad_pooled, index_t ksize_y, index_t ksize_x, index_t kstride) - : data_src_(data_src), data_pooled_(data_pooled), grad_pooled_(grad_pooled), - ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { - utils::Assert(grad_pooled.shape == data_pooled.shape, "UnPoolingExp: pooled shape mismatch"); - utils::Assert(grad_pooled.shape[2] == data_src.shape[2], "UnPoolingExp: pool and src shape mismatch"); - utils::Assert(grad_pooled.shape[3] == data_src.shape[3], "UnPoolingExp: pool and src shape mismatch"); - this->shape_ = data_src_.shape; - } - }; - - /*! - * \brief padding expression, pad a image with zeros - * \tparam SrcExp source expression to be pooled from - * \tparam srcdim dimension of src - */ - template - struct PaddingExp : public MakeTensorExp, SrcExp, srcdim> { - /*! \brief source operand */ - const SrcExp& src_; - /*! \brief pad size in y */ - index_t pad_y_; - /*! \brief pad size in x */ - index_t pad_x_; - /*! \brief source tensor height */ - index_t src_height_; - /*! \brief source tensor width */ - index_t src_width_; - /*! \brief constructor */ - PaddingExp(const SrcExp &src, index_t pad_y, index_t pad_x) - : src_(src), pad_y_(pad_y), pad_x_(pad_x) { - this->shape_ = ShapeCheck::Check(src_); - src_height_ = this->shape_[1]; - src_width_ = this->shape_[0]; - this->shape_[1] += pad_y * 2; // height - this->shape_[0] += pad_x * 2; // width - } - }; - /*! * \brief crop expression, cut off the boundary region, reverse operation of padding * \tparam SrcExp source expression to be pooled from @@ -169,25 +66,6 @@ namespace mshadow{ } }; - /*! - * \brief channel pooling expression, do reduction over (local nearby) channels, used to implement local response normalization - * \tparam Reducer reduction method during pooling - * \tparam SrcExp source expression to be pooled from - * \tparam srcdim dimension of src - */ - template - struct ChannelPoolingExp: public MakeTensorExp< ChannelPoolingExp, SrcExp, srcdim> { - /*! \brief source operand */ - const SrcExp& src_; - /*! \brief neighbor size */ - index_t nsize_; - /*! \brief constructor */ - ChannelPoolingExp(const SrcExp &src, index_t nsize): src_(src), nsize_(nsize){ - utils::Assert(nsize % 2 == 1, "ChannelPoolingExp: local size must be odd, to make it symmetric"); - this->shape_ = ShapeCheck::Check(src_); - utils::Assert(this->shape_[2] >= nsize_, "ChannelPoolingExp: local size need to be smaller than number of channels"); - } - }; }; // namespace expr @@ -195,88 +73,6 @@ namespace mshadow{ namespace expr{ - /*! - * \brief pooling subregion results together - * \param src source image, shape[3]: batch, shape[2]: channel shape[1]: height shape[0]:width - * \param ksize_y kernel size in height - * \param ksize_x kernel size in width - * \param kstride stride for each kernel - * \return expression of pooled result - * \tparam Reducer reducer type - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline PoolingExp::kDim > pool(const Exp &src, index_t ksize_y, index_t ksize_x, index_t kstride) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return PoolingExp::kDim >(src.self(), ksize_y, ksize_x, kstride); - } - - /*! 
- * \brief same as pool, except the output shape is specified by pshape - * \param src source image - * \param pshape ouput shape - * \param ksize_y kernel size in y - * \param ksize_x kernel size in x - * \param kstride stride for each kernel - * \return expression of pooled result - * \tparam Reducer reducer type - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline PoolingExp::kDim > pool(const Exp &src, Shape<2> pshape, index_t ksize_y, index_t ksize_x, index_t kstride) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return PoolingExp::kDim >(src.self(), pshape, ksize_y, ksize_x, kstride); - } - - /*! - * \brief unpooling gradient for 4D, backprop gradient value back, revserse operation of pooling, same as unpooling, but allows unequal size of kernel - * \param data_src source input, corresponds to src in pooling - * \param data_pooled result of pooled data, corresponds to result of pooling - * \param grad_pooled gradient data of pooled part, to be propgate down - * \param ksize_y kernel height - * \param ksize_x kernel width - * \param kstride stride for each kernel - * \return expression corresponding to unpooled 4D Tensor, storing backproped gradient - * \tparam Reducer reducer type - * \tparam Device device where data lies - */ - template - inline UnPoolingExp unpool(const Tensor&data_src, const Tensor &data_pooled, - const Tensor &grad_pooled, index_t ksize_y, index_t ksize_x, index_t kstride) { - return UnPoolingExp(data_src, data_pooled, grad_pooled, ksize_y, ksize_x, kstride); - } - - /*! - * \brief padding expression, pad a image with zeros on boundaries, padding affects shape[0], and shape[1] - * \param src original image batches - * \param pad padding size - * \return expression corresponding to padded result - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline PaddingExp::kDim> pad(const Exp &src, index_t pad) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return PaddingExp::kDim>(src.self(), pad, pad); - } - - /*! - * \brief padding expression, pad a image with zeros on boundaries, padding affects shape[0], and shape[1] - * \param src original image batches - * \param pad_y padding size in y - * \param pad_x padding size in x - * \return expression corresponding to padded result - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline PaddingExp::kDim> pad(const Exp &src, index_t pad_y, index_t pad_x) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return PaddingExp::kDim>(src.self(), pad_y, pad_x); - } - /*! * \brief revserse operationg of padding, cut off boundaries, crop output from center of input @@ -320,20 +116,7 @@ namespace mshadow{ return MirroringExp::kDim>(src.self()); } - /*! 
- * \brief channel pooling, do reduction over (local nearby) channels, used to implement local response normalization - * \param src source data - * \param nsize neighbor size - * \return expression of pooled result - * \tparam Reducer reducer type - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline ChannelPoolingExp::kDim > chpool(const Exp &src, index_t nsize) { - TypeCheckPass< ExpInfo::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return ChannelPoolingExp::kDim >(src.self(),nsize); - } + }; // namespace expr @@ -363,101 +146,9 @@ namespace mshadow{ }; namespace expr{ - template - struct Plan< PoolingExp< Reducer, SrcExp, srcdim> > { - public: - Plan(const PoolingExp &e) - : src_(MakePlan(e.src_)), ksize_y_(e.ksize_y_), ksize_x_(e.ksize_x_), - kstride_(e.kstride_), - src_height_(e.src_height_),src_width_(e.src_width_), new_height_(e.shape_[1]) { - } - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - using namespace std; - const index_t py = i % new_height_; - const index_t y_start = py * kstride_; - const index_t y_end = min(y_start + ksize_y_, src_height_); - const index_t px = j; - const index_t x_start = px * kstride_; - const index_t x_end = min(x_start + ksize_x_, src_width_); - const index_t c = i / new_height_; - - real_t res = Reducer::kInitV; - for (index_t y = y_start; y < y_end; ++y) { - for (index_t x = x_start; x < x_end; ++x) { - Reducer::Reduce(res, src_.Eval(c*src_height_+y, x)); - } - } - return res; - } - private: - Plan src_; - const index_t ksize_y_, ksize_x_, kstride_; - const index_t src_height_, src_width_; - const index_t new_height_; - }; - - template - struct Plan > { - public: - Plan(const UnPoolingExp &e) - : data_src_(e.data_src_), data_pooled_(e.data_pooled_), grad_pooled_(e.grad_pooled_), - ksize_y_(e.ksize_y_), ksize_x_(e.ksize_x_), kstride_(e.kstride_) {} - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - using namespace std; - const index_t x = j; - const index_t y = i % data_src_.shape[1]; - const index_t c = i / data_src_.shape[1]; - const real_t vsrc = data_src_[0][c][y][x]; - - const index_t py_min = y < ksize_y_ ? 0 : (y-ksize_y_+kstride_)/kstride_; - const index_t px_min = x < ksize_x_ ? 
0 : (x-ksize_x_+kstride_)/kstride_; - const index_t py_max = min((y+kstride_)/kstride_, data_pooled_.shape[1]); - const index_t px_max = min((x+kstride_)/kstride_, data_pooled_.shape[0]); - - real_t val = 0; - for(index_t py = py_min; py < py_max; ++py){ - for(index_t px = px_min; px < px_max; ++px){ - val += Reducer::PartialGrad(vsrc, data_pooled_[0][c][py][px]) * grad_pooled_[0][c][py][px]; - } - } - return val; - } - private: - Tensor data_src_, data_pooled_, grad_pooled_; - const index_t ksize_y_, ksize_x_; - const index_t kstride_; - }; }; // namespace expr namespace expr{ - template - struct Plan< PaddingExp > { - public: - Plan(const PaddingExp &e) - : src_(MakePlan(e.src_)), pad_y_(e.pad_y_), pad_x_(e.pad_x_), - new_height_(e.shape_[1]), - src_height_(e.src_height_), src_width_(e.src_width_) {} - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - const index_t x = j; - const index_t y = i % new_height_; - const index_t c = i / new_height_; - if (y < pad_y_ || x < pad_x_) return 0.0f; - const index_t h = y - pad_y_; - const index_t w = x - pad_x_; - if (h < src_height_ && w < src_width_) { - return src_.Eval(c * src_height_ + h, w); - } else { - return 0.0f; - } - } - private: - Plan src_; - const index_t pad_y_; - const index_t pad_x_; - const index_t new_height_; - const index_t src_height_; - const index_t src_width_; - }; template struct Plan > { @@ -495,32 +186,7 @@ namespace mshadow{ }; // namespace expr namespace expr{ - template - struct Plan< ChannelPoolingExp< Reducer, SrcExp, srcdim> > { - public: - Plan(const ChannelPoolingExp &e) - : src_(MakePlan(e.src_)), channel_(e.shape_[2]), - height_(e.shape_[1]),width_(e.shape_[0]), hnsize_(e.nsize_/2){ - } - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - using namespace std; - const index_t y = i % height_; - i /= height_; - const index_t c = i % channel_; - const index_t n = i / channel_; - const index_t x = j; - const index_t cstart = c < hnsize_ ? 
0 : c - hnsize_;
-                const index_t cend = min(c + hnsize_ + 1, channel_);
-                real_t res = Reducer::kInitV;
-                for(index_t cc = cstart; cc < cend; ++ cc){
-                    Reducer::Reduce(res, src_.Eval((n*channel_+cc)*height_ + y, x));
-                }
-                return res;
-            }
-        private:
-            Plan src_;
-            const index_t channel_, height_, width_, hnsize_;
-        };
+
     };
 }; // namespace mshadow
From 5455e385e74e44f557d282c2e05a23859e757efb Mon Sep 17 00:00:00 2001
From: tqchen 
Date: Fri, 26 Dec 2014 19:30:47 -0800
Subject: [PATCH 032/147] checkin extensions
---
 example/basic.cpp                |   9 +-
 mshadow/extension.h              |   2 +
 mshadow/extension/mirror.h       |  62 ++++++++++
 mshadow/extension/reshape.h      |   2 +-
 mshadow/extension/spatial_pool.h |   6 +-
 mshadow/sse-inl.h                |   2 +-
 mshadow/tensor_expr_ext.h        | 196 -------------------------------
 mshadow/utils.h                  |   2 +-
 8 files changed, 75 insertions(+), 206 deletions(-)
 create mode 100644 mshadow/extension/mirror.h
 delete mode 100644 mshadow/tensor_expr_ext.h
diff --git a/example/basic.cpp b/example/basic.cpp
index 2e7869b0d29a..79194e39a157 100644
--- a/example/basic.cpp
+++ b/example/basic.cpp
@@ -16,17 +16,18 @@ int main(void) {
   Tensor mat = ts[0];
   // Tensor object is only a handle, assignment means they have same data content
   Tensor mat2= NewTensor(Shape1(2), 0.0f);
+  Tensor ts1= NewTensor(ts.shape_, 0.0f);
   mat2[1] = 10;
   // shape of matrix, note shape order is different from numpy
   // shape[i] indicate the shape of i-th dimension
   printf("%u X %u matrix, stride=%u\n", mat.size(0), mat.size(1), mat.stride_);
-
-
+
   // assign some values
   mat[0][1] = 1.0f; mat[1][0] = 2.0f;
   // elementwise operations
-  ts = broadcast<0>(mat2, ts.shape_);
-
+
+  //ts = broadcast<0>(mat2, ts.shape_);
+  mat2 = sumall_except_dim<0>(mat);
   // print out matrix, note: mat2 and mat1 are handles(pointers)
   for (index_t c = 0; c < ts.size(0); ++c) {
     for (index_t i = 0; i < mat.size(0); ++i) {
diff --git a/mshadow/extension.h b/mshadow/extension.h
index 25092c6b2567..f9e9badeb82b 100644
--- a/mshadow/extension.h
+++ b/mshadow/extension.h
@@ -18,5 +18,7 @@
 #include "./extension/spatial_unpool.h"
 #include "./extension/channel_pool.h"
 #include "./extension/pad.h"
+#include "./extension/crop.h"
+#include "./extension/mirror.h"
 #endif
diff --git a/mshadow/extension/mirror.h b/mshadow/extension/mirror.h
new file mode 100644
index 000000000000..9e9edc9b6f70
--- /dev/null
+++ b/mshadow/extension/mirror.h
@@ -0,0 +1,62 @@
+/*!
+ * Copyright (c) 2014 by Contributors
+ * \file mirror.h
+ * \brief support for mirror
+ * \author Tianqi Chen
+ */
+#ifndef MSHADOW_EXTENSION_MIRROR_H_
+#define MSHADOW_EXTENSION_MIRROR_H_
+#include "../extension.h"
+namespace mshadow {
+namespace expr {
+/*!
+ * \brief mirror expression, mirror an image in width
+ * \tparam SrcExp source expression to be mirrored
+ * \tparam DType the type of elements
+ * \tparam srcdim dimension of src
+ */
+template
+struct MirroringExp:
+      public MakeTensorExp,
+                          SrcExp, srcdim, DType> {
+  /*! \brief source operand */
+  const SrcExp &src_;
+  /*! \brief constructor */
+  explicit MirroringExp(const SrcExp &src) : src_(src) {
+    this->shape_ = ShapeCheck::Check(src_);
+  }
+};
+/*!
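+ * Editor's note: a usage sketch for the mirror() wrapper documented next
+ * (img and flipped are assumed tensors of identical shape):
+ *
+ *   flipped = mirror(img);  // reverse each row; the shape is unchanged
+ */
+/*!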
+ * \brief mirroring expression, mirror images in width + * \param src original image batches + * \return expression corresponding to mirrored result + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline MirroringExp::kDim> +mirror(const Exp &src) { + TypeCheckPass::kDim >= 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return MirroringExp::kDim>(src.self()); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const MirroringExp &e) + : src_(MakePlan(e.src_)), width_(e.shape_[srcdim - 1]) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + return src_.Eval(i, width_ - j - 1); + } + + private: + Plan src_; + const index_t width_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_MIRROR_H_ diff --git a/mshadow/extension/reshape.h b/mshadow/extension/reshape.h index a96bc190bf13..738e98f0e2c9 100644 --- a/mshadow/extension/reshape.h +++ b/mshadow/extension/reshape.h @@ -29,7 +29,7 @@ struct ReshapeExp: ReshapeExp(const SrcExp &src, Shape shape) : src_(src) { Shape ishape = ShapeCheck::Check(src_); - utils::Assert(ishape.Size() == shape.Size(), "reshape size must match"); + utils::Check(ishape.Size() == shape.Size(), "reshape size must match"); ishapex_ = ishape[dimsrc - 1]; this->shape_ = shape; } diff --git a/mshadow/extension/spatial_pool.h b/mshadow/extension/spatial_pool.h index b87c03f6765d..aa0435fdab0f 100644 --- a/mshadow/extension/spatial_pool.h +++ b/mshadow/extension/spatial_pool.h @@ -51,9 +51,9 @@ struct PoolingExp: index_t ksize_y, index_t ksize_x, index_t kstride) : src_(src), ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { Shape sshape = ShapeCheck::Check(src_); - utils::Assert(sshape[srcdim - 1] >= ksize_x && - sshape[srcdim - 2] >= ksize_y, - "PoolingExp: kernel must be smaller than image"); + utils::Check(sshape[srcdim - 1] >= ksize_x && + sshape[srcdim - 2] >= ksize_y, + "PoolingExp: kernel must be smaller than image"); this->src_height_ = sshape[srcdim - 2]; this->src_width_ = sshape[srcdim - 1]; this->shape_ = sshape; diff --git a/mshadow/sse-inl.h b/mshadow/sse-inl.h index ea5392291fd1..9281c2a7d487 100644 --- a/mshadow/sse-inl.h +++ b/mshadow/sse-inl.h @@ -36,7 +36,7 @@ inline void* AlignedMallocPitch(size_t *out_pitch, void *res = memalign(16, pitch * num_line); #endif #endif - utils::Assert(res != NULL, "AlignedMallocPitch failed"); + utils::Check(res != NULL, "AlignedMallocPitch failed"); return res; } /*! diff --git a/mshadow/tensor_expr_ext.h b/mshadow/tensor_expr_ext.h deleted file mode 100644 index f4fafd9142fa..000000000000 --- a/mshadow/tensor_expr_ext.h +++ /dev/null @@ -1,196 +0,0 @@ -#ifndef MSHADOW_TENSOR_EXPR_EXT_H -#define MSHADOW_TENSOR_EXPR_EXT_H -/*! - * \file tensor_expr_ext.h - * \brief some extension of expressions, used to support something beyond elementwise op - * \author Tianqi Chen, Bing Xu - */ -#include "tensor_expr_engine-inl.hpp" -namespace mshadow{ - // Declaration of expressions goes here - namespace expr{ - - - /*! - * \brief crop expression, cut off the boundary region, reverse operation of padding - * \tparam SrcExp source expression to be pooled from - * \tparam srcdim dimension of src - */ - template - struct CroppingExp : public MakeTensorExp< CroppingExp, SrcExp, srcdim> { - /*! \brief source operand */ - const SrcExp& src_; - /*! \brief pad height */ - index_t pad_height_; - /*! 
\brief pad height */ - index_t pad_width_; - /*! \brief src height */ - index_t src_height_; - /*! \brief constructor */ - CroppingExp(const SrcExp &src, Shape<2> cshape): src_(src) { - this->shape_ = ShapeCheck::Check(src_); - utils::Assert(this->shape_[1] >= cshape[1], "CroppingExp: height requirement not met"); - utils::Assert(this->shape_[0] >= cshape[0], "CroppingExp: width requirement not met"); - pad_height_ = (this->shape_[1] - cshape[1]) / 2; - pad_width_ = (this->shape_[0] - cshape[0]) / 2; - src_height_ = this->shape_[1]; - this->shape_[1] = cshape[1]; // width - this->shape_[0] = cshape[0]; // height - } - /*! \brief constructor */ - CroppingExp(const SrcExp &src, Shape<2> cshape, index_t start_height, index_t start_width ) - : src_(src), pad_height_(start_height), pad_width_(start_width) { - this->shape_ = ShapeCheck::Check(src_); - utils::Assert(this->shape_[1] >= cshape[1]+start_height, "CroppingExp: height requirement not met"); - utils::Assert(this->shape_[0] >= cshape[0]+start_width, "CroppingExp: width requirement not met"); - src_height_ = this->shape_[1]; - this->shape_[1] = cshape[1]; // width - this->shape_[0] = cshape[0]; // height - } - - }; // struct CroppingExp - - - /*! - * \brief mirror expression, mirror a image in width - * \tparam SrcExp source expression to be mirrored - * \tparam srcdim dimension of src - */ - template - struct MirroringExp : public MakeTensorExp, SrcExp, srcdim> { - /*! \brief source operand */ - const SrcExp& src_; - /*! \brief constructor */ - MirroringExp(const SrcExp &src): src_(src) { - this->shape_ = ShapeCheck::Check(src_); - } - }; - - }; // namespace expr - - - // Declaration of all functions go here - namespace expr{ - - - - /*! - * \brief revserse operationg of padding, cut off boundaries, crop output from center of input - * \param src original image batches - * \param oshape output shape to be cropped - * \return expression corresponding to padded result - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline CroppingExp::kDim> crop(const Exp &src, Shape<2> oshape) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return CroppingExp::kDim>(src.self(), oshape); - } - /*! - * \brief same as crop, but can specify starting position to do cropping - * \param src original image batches - * \param oshape output shape to be cropped - * \param start_height start height position to do cropping - * \param start_width start width position to do cropping - * \return expression corresponding to padded result - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline CroppingExp::kDim> crop(const Exp &src, Shape<2> oshape, index_t start_height, index_t start_width) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return CroppingExp::kDim>(src.self(), oshape, start_height, start_width); - } - - /*! 
- * \brief mirroring expression, mirror images in width - * \param src original image batches - * \return expression corresponding to mirrored result - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline MirroringExp::kDim> mirror(const Exp &src) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return MirroringExp::kDim>(src.self()); - } - - - - - }; // namespace expr -}; // namespace mshadow - -// ================================================== -// implementations afterwards, -// no need to read if only use the functions -// -------------------------------------------------- -namespace mshadow{ - namespace expr{ - }; // namespace expr - - namespace expr{ - - }; // namespace expr - - namespace expr{ - - - }; - namespace expr{ - }; - - namespace expr{ - - }; - - namespace expr{ - }; // namespace expr - - namespace expr{ - - template - struct Plan > { - public: - Plan(const CroppingExp &e) - : src_(MakePlan(e.src_)), pad_height_(e.pad_height_),pad_width_(e.pad_width_), - new_height_(e.shape_[1]), src_height_(e.src_height_) {} - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - const index_t x = j; - const index_t y = i % new_height_; - const index_t c = i / new_height_; - const index_t h = y + pad_height_; - const index_t w = x + pad_width_; - return src_.Eval(c * src_height_ + h, w); - } - private: - Plan src_; - const index_t pad_height_, pad_width_; - const index_t new_height_; - const index_t src_height_; - }; - - template - struct Plan< MirroringExp > { - public: - Plan(const MirroringExp &e) - : src_(MakePlan(e.src_)), width_(e.shape_[0]){} - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - return src_.Eval(i, width_ - j - 1); - } - private: - Plan src_; - const index_t width_; - }; - }; // namespace expr - - namespace expr{ - - }; -}; // namespace mshadow - - - -#endif - diff --git a/mshadow/utils.h b/mshadow/utils.h index 6003f5562814..3da31a986c56 100644 --- a/mshadow/utils.h +++ b/mshadow/utils.h @@ -42,7 +42,7 @@ void HandlePrint(const char *msg); #endif /*! \brief assert an condition is true, use this to handle debug information */ -inline void Assert(bool exp, const char *fmt, ...) { +inline void AssertX(bool exp, const char *fmt, ...) { if (!exp) { std::string msg(kPrintBuffer, '\0'); va_list args; From b0b6847fa6c12da0ff1af78e7e7e3ffc15207131 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 26 Dec 2014 19:31:53 -0800 Subject: [PATCH 033/147] checkin crop --- mshadow/extension/crop.h | 121 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 mshadow/extension/crop.h diff --git a/mshadow/extension/crop.h b/mshadow/extension/crop.h new file mode 100644 index 000000000000..d740d7bb18c9 --- /dev/null +++ b/mshadow/extension/crop.h @@ -0,0 +1,121 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file crop.h + * \brief support for crop + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_CROP_H_ +#define MSHADOW_EXTENSION_CROP_H_ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief crop expression, cut off the boundary region, reverse operation of padding + * \tparam SrcExp source expression to be pooled from + * \tparam DType the type of elements + * \tparam srcdim dimension of src + */ +template +struct CroppingExp: + public MakeTensorExp, + SrcExp, srcdim, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief pad height */ + index_t pad_height_; + /*! 
\brief pad width */
+  index_t pad_width_;
+  /*! \brief src height */
+  index_t src_height_;
+  /*! \brief constructor */
+  explicit CroppingExp(const SrcExp &src, Shape<2> cshape)
+      : src_(src) {
+    this->shape_ = ShapeCheck::Check(src_);
+    utils::Check(this->shape_[srcdim - 2] >= cshape[0],
+                 "CroppingExp: height requirement not met");
+    utils::Check(this->shape_[srcdim - 1] >= cshape[1],
+                 "CroppingExp: width requirement not met");
+    pad_height_ = (this->shape_[srcdim - 2] - cshape[0]) / 2;
+    pad_width_ = (this->shape_[srcdim - 1] - cshape[1]) / 2;
+    src_height_ = this->shape_[srcdim - 2];
+    this->shape_[srcdim - 2] = cshape[0];  // height
+    this->shape_[srcdim - 1] = cshape[1];  // width
+  }
+  /*! \brief constructor */
+  explicit CroppingExp(const SrcExp &src, Shape<2> cshape,
+                       index_t start_height, index_t start_width)
+      : src_(src), pad_height_(start_height), pad_width_(start_width) {
+    this->shape_ = ShapeCheck::Check(src_);
+    utils::Check(this->shape_[srcdim - 2] >= cshape[0] + start_height,
+                 "CroppingExp: height requirement not met");
+    utils::Check(this->shape_[srcdim - 1] >= cshape[1] + start_width,
+                 "CroppingExp: width requirement not met");
+    src_height_ = this->shape_[srcdim - 2];
+    this->shape_[srcdim - 2] = cshape[0];  // height
+    this->shape_[srcdim - 1] = cshape[1];  // width
+  }
+};  // struct CroppingExp
+/*!
+ * \brief reverse operation of padding, cut off boundaries,
+ *   crop output from center of input
+ * \param src original image batches
+ * \param oshape output shape to be cropped
+ * \return expression corresponding to padded result
+ * \tparam SrcExp source expression
+ * \tparam DType the type of elements
+ * \tparam etype type of expression
+ */
+template
+inline CroppingExp::kDim>
+crop(const Exp &src, Shape<2> oshape) {
+  TypeCheckPass::kDim >= 2>
+      ::Error_Expression_Does_Not_Meet_Dimension_Req();
+  return CroppingExp::kDim>(src.self(), oshape);
+}
+/*!
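+ * Editor's note: a center-crop sketch for the overload above (img is an
+ * assumed 4D float tensor with 32x32 spatial dimensions):
+ *
+ *   TensorContainer<cpu, 4, float> out(Shape4(1, 3, 28, 28));
+ *   out = crop(img, Shape2(28, 28));  // keeps the centered 28x28 window
+ */
+/*!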
+ * \brief same as crop, but can specify starting position to do cropping + * \param src original image batches + * \param oshape output shape to be cropped + * \param start_height start height position to do cropping + * \param start_width start width position to do cropping + * \return expression corresponding to padded result + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline CroppingExp::kDim> +crop(const Exp &src, Shape<2> oshape, + index_t start_height, index_t start_width) { + TypeCheckPass::kDim >= 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return CroppingExp::kDim> + (src.self(), oshape, start_height, start_width); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const CroppingExp &e) + : src_(MakePlan(e.src_)), + pad_height_(e.pad_height_), pad_width_(e.pad_width_), + new_height_(e.shape_[srcdim - 2]), src_height_(e.src_height_) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + const index_t x = j; + const index_t y = i % new_height_; + const index_t c = i / new_height_; + const index_t h = y + pad_height_; + const index_t w = x + pad_width_; + return src_.Eval(c * src_height_ + h, w); + } + private: + Plan src_; + const index_t pad_height_, pad_width_; + const index_t new_height_; + const index_t src_height_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_CROP_H_ From 225fed114c0b00899a5338b4620c1a2b31da7a5e Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 26 Dec 2014 22:15:57 -0800 Subject: [PATCH 034/147] one pass over random --- mshadow/random.h | 338 +++++++++++++++++++++++++++++++++++++ mshadow/tensor.h | 1 + mshadow/tensor_container.h | 10 +- mshadow/tensor_random.h | 299 -------------------------------- 4 files changed, 344 insertions(+), 304 deletions(-) create mode 100644 mshadow/random.h delete mode 100644 mshadow/tensor_random.h diff --git a/mshadow/random.h b/mshadow/random.h new file mode 100644 index 000000000000..c9875ded36d2 --- /dev/null +++ b/mshadow/random.h @@ -0,0 +1,338 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file random.h + * \brief Random inline functions for tensor. + * \author Bing Xu, Tianqi Chen + * Based on curand|MKL|stdlib + */ +#ifndef MSHADOW_RANDOM_H_ +#define MSHADOW_RANDOM_H_ +#include +#include "./tensor.h" +#include "./tensor_container.h" + +namespace mshadow { +/*! + * \brief random number generator + * \tparam Device the device of random number generator + * \tparam DType the target data type of random number can be float for double + */ +template +class Random {}; + +/*! \brief CPU random number generator */ +template +class Random { + public: + /*! + * \brief constructor of random engine + * \param seed random number seed + */ + explicit Random(int seed) { + this->Seed(seed); + buffer_.Resize(Shape1(kRandBufferSize)); + } + ~Random(void) { +#if MSHADOW_USE_MKL + vslDeleteStream(&vStream_); +#endif + } + /*! + * \brief seed random number generator using this seed + * \param seed seed of prng + */ + inline void Seed(int seed) { +#if MSHADOW_USE_MKL + int status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed); + utils::Check(status == VSL_STATUS_OK, + "MKL VSL Random engine failed to be initialized.\n"); +#else + this->rseed_ = static_cast(seed); +#endif + } + /*! 
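+   * Editor's note: typical setup for the samplers declared below (weight
+   * is an assumed CPU tensor; the seed value is arbitrary):
+   *
+   *   Random<cpu, float> rnd(37);
+   *   rnd.SampleGaussian(&weight, 0.0f, 0.01f);  // fill with N(0, 0.01^2)
+   */
+  /*!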
+ * \brief generate data from uniform [a,b) + * \param dst destination + * \param a lower bound of uniform + * \param b upper bound of uniform + * \tparam dim dimension of tensor + */ + template + inline void SampleUniform(Tensor *dst, + DType a = 0.0f, DType b = 1.0f) { + Tensor mat = dst->FlatTo2D(); + for (index_t i = 0; i < mat.size(0); ++i) { + this->GenUniform(mat[i].dptr_, mat.size(1), a, b); + } + } + /*! + * \brief generate data from standard gaussian + * \param dst destination + * \param mu mean variable + * \param sigma standard deviation + * \tparam dim dimension of tensor + */ + template + inline void SampleGaussian(Tensor *dst, + DType mu = 0.0f, DType sigma = 1.0f) { + if (sigma <= 0.0f) { + *dst = mu; return; + } + Tensor mat = dst->FlatTo2D(); + for (index_t i = 0; i < mat.size(0); ++i) { + this->GenGaussian(mat[i].dptr_, mat.size(1), mu, sigma); + } + } + /*! + * \brief return a temporal expression storing standard gaussian random variables + * the temporal tensor is only valid before next call of gaussian or uniform + * can be used as part of expression + * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, + * since second call of gaussian(s2) makes gaussian(s1) invalid + * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression + * \param shape shape of the tensor + * \tparam dim dimension of tensor + */ + template + inline expr::ReshapeExp, DType, dim, 1> + gaussian(Shape shape) { + buffer_.Resize(Shape1(shape.Size())); + this->SampleGaussian(&buffer_, 0.0f, 1.0f); + return expr::reshape(buffer_, shape); + } + /*! + * \brief return a temporal expression storing standard uniform [0,1) + * the temporal tensor is only valid before next call of gaussian or uniform + * can be used as part of expression + * Caution: this means expression such as A = uniform(s1) * uniform(s2) will give invalid result, + * since second call of gaussian(s2) makes gaussian(s1) invalid + * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression + * \param shape shape of the tensor + * \tparam dim dimension of tensor + */ + template + inline expr::ReshapeExp, DType, dim, 1> + uniform(Shape shape) { + buffer_.Resize(Shape1(shape.Size())); + this->SampleUniform(&buffer_, 0.0f, 1.0f); + return expr::reshape(buffer_, shape); + } + + private: +#if MSHADOW_USE_MKL + /*! \brief stream used by MKL VSL */ + VSLStreamStatePtr vStream_; + // generate uniform distribution + inline void GenUniform(float *dptr, index_t size, float a, float b) { + int status = vsRngUniform(0, vStream_, size, dptr, a, b); + utils::Check(status == VSL_STATUS_OK, + "Failed to generate random number by MKL."); + } + inline void GenUniform(double *dptr, index_t size, double a, double b) { + int status = vdRngUniform(0, vStream_, size, dptr, a, b); + utils::Check(status == VSL_STATUS_OK, + "Failed to generate random number by MKL."); + } + inline void GenGaussian(float *dptr, index_t size, float mu, float sigma) { + int status = vsRngGaussian(0, vStream_, size, dptr, mu, sigma); + utils::Check(status == VSL_STATUS_OK, + "Failed to generate random number by MKL."); + } + inline void GenGaussian(double *dptr, index_t size, double mu, double sigma) { + int status = vdRngGaussian(0, vStream_, size, dptr, mu, sigma); + utils::Check(status == VSL_STATUS_OK, + "Failed to generate random number by MKL."); + } +#else + /*! 
+  unsigned rseed_;
+  // functions
+  inline void GenUniform(float *dptr, index_t size, float a, float b) {
+    for (index_t j = 0; j < size; ++j) {
+      dptr[j] = static_cast<float>(RandNext()) * (b - a) + a;
+    }
+  }
+  inline void GenUniform(double *dptr, index_t size, double a, double b) {
+    for (index_t j = 0; j < size; ++j) {
+      dptr[j] = static_cast<double>(RandNext()) * (b - a) + a;
+    }
+  }
+  inline void GenGaussian(float *dptr, index_t size, float mu, float sigma) {
+    this->GenGaussianX(dptr, size, mu, sigma);
+  }
+  inline void GenGaussian(double *dptr, index_t size, double mu, double sigma) {
+    this->GenGaussianX(dptr, size, mu, sigma);
+  }
+  inline void GenGaussianX(DType *dptr, index_t size, DType mu, DType sigma) {
+    DType g1 = 0.0f, g2 = 0.0f;
+    for (index_t j = 0; j < size; ++j) {
+      if ((j & 1) == 0) {
+        this->SampleNormal2D(&g1, &g2);
+        dptr[j] = mu + g1 * sigma;
+      } else {
+        dptr[j] = mu + g2 * sigma;
+      }
+    }
+  }
+  /*! \brief get next random number from rand */
+  inline DType RandNext(void) {
+    return static_cast<DType>(rand_r(&rseed_)) /
+        (static_cast<DType>(RAND_MAX) + 1.0f);
+  }
+  /*! \brief return a real number uniform in (0,1) */
+  inline DType RandNext2(void) {
+    return (static_cast<DType>(rand_r(&rseed_)) + 1.0f) /
+        (static_cast<DType>(RAND_MAX) + 2.0f);
+  }
+  /*!
+   * \brief sample iid xx,yy ~N(0,1) using the polar Box-Muller method
+   * \param xx_ first gaussian output
+   * \param yy_ second gaussian output
+   */
+  inline void SampleNormal2D(DType *xx_, DType *yy_) {
+    DType &xx = *xx_, &yy = *yy_;
+    DType x, y, s;
+    do {
+      x = 2.0f * RandNext2() - 1.0f;
+      y = 2.0f * RandNext2() - 1.0f;
+      s = x * x + y * y;
+    } while (s >= 1.0f || s == 0.0f);
+    DType t = std::sqrt(-2.0f * std::log(s) / s);
+    xx = x * t; yy = y * t;
+  }
+#endif
+  /*! \brief temporal space used to store random numbers */
+  TensorContainer<cpu, 1, DType> buffer_;
+};  // class Random<cpu, DType>
+// only allow GPU PRNG in CUDACC
+#ifdef __CUDACC__
+/*! \brief GPU random number generator */
+template<typename DType>
+class Random<gpu, DType> {
+ public:
+  /*!
+   * \brief constructor of random engine
+   * \param seed random number seed
+   */
+  explicit Random(int seed) {
+    curandStatus_t status;
+    status = curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT);
+    utils::Check(status == CURAND_STATUS_SUCCESS,
+                 "Can not create CURAND Generator");
+    this->Seed(seed);
+    buffer_.Resize(Shape1(kRandBufferSize));
+  }
+  ~Random(void) {
+    curandStatus_t status;
+    status = curandDestroyGenerator(gen_);
+    utils::Check(status == CURAND_STATUS_SUCCESS,
+                 "Destroy CURAND Gen failed");
+  }
+  /*!
+   * \brief seed random number generator using this seed
+   * \param seed seed of prng
+   */
+  inline void Seed(int seed) {
+    curandStatus_t status;
+    status = curandSetPseudoRandomGeneratorSeed(gen_, seed);
+    utils::Check(status == CURAND_STATUS_SUCCESS,
+                 "Set CURAND seed failed.");
+  }
+  /*!
+   * \brief generate data from uniform [a,b)
+   * \param dst destination
+   * \param a lower bound of uniform
+   * \param b upper bound of uniform
+   * \tparam dim dimension of tensor
+   */
+  template<int dim>
+  inline void SampleUniform(Tensor<gpu, dim, DType> *dst,
+                            DType a = 0.0f, DType b = 1.0f) {
+    if (a == 0.0f && b == 1.0f) {
+      *dst = this->uniform(dst->shape_);
+    } else {
+      *dst = this->uniform(dst->shape_) * (b - a) + a;
+    }
+  }
+  /*!
+   * \brief generate data from standard gaussian
+   * \param dst destination
+   * \param mu mean variable
+   * \param sigma standard deviation
+   * \tparam dim dimension of tensor
+   */
+  template<int dim>
+  inline void SampleGaussian(Tensor<gpu, dim, DType> *dst,
+                             DType mu = 0.0f, DType sigma = 1.0f) {
+    *dst = this->gaussian(dst->shape_, mu, sigma);
+  }
+  /*!
+   * \brief return a temporal expression storing standard gaussian random variables
+   *  the temporal tensor is only valid before the next call of gaussian or uniform
+   *  can be used as part of expression
+   *  Caution: this means an expression such as A = gaussian(s1) * gaussian(s2) will give an invalid result,
+   *  since the second call of gaussian(s2) makes gaussian(s1) invalid
+   *  A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression
+   * \param shape shape of the tensor
+   * \param mu mean
+   * \param sigma standard deviation
+   * \tparam dim dimension of tensor
+   */
+  template<int dim>
+  inline expr::ReshapeExp<Tensor<gpu, 1, DType>, DType, dim, 1>
+  gaussian(Shape<dim> shape, DType mu = 0.0f, DType sigma = 1.0f) {
+    // curand generates gaussians in pairs, so reserve an even-sized buffer,
+    // then shrink the logical size back; Resize keeps the larger allocation
+    size_t aligned_sz = ((shape.Size() + 1UL) >> 1) << 1;
+    buffer_.Resize(Shape1(aligned_sz));
+    buffer_.Resize(Shape1(shape.Size()));
+    this->GenGaussian(buffer_.dptr_, aligned_sz, mu, sigma);
+    return expr::reshape(buffer_, shape);
+  }
+  /*!
+   * \brief return a temporal expression storing standard uniform [0,1)
+   *  the temporal tensor is only valid before the next call of gaussian or uniform
+   *  can be used as part of expression
+   *  Caution: this means an expression such as A = uniform(s1) * uniform(s2) will give an invalid result,
+   *  since the second call of uniform(s2) makes uniform(s1) invalid
+   *  A = uniform(s1)*B+C; is correct; use one gaussian/uniform in each expression
+   * \param shape shape of the tensor
+   * \tparam dim dimension of tensor
+   */
+  template<int dim>
+  inline expr::ReshapeExp<Tensor<gpu, 1, DType>, DType, dim, 1>
+  uniform(Shape<dim> shape) {
+    buffer_.Resize(Shape1(shape.Size()));
+    this->GenUniform(buffer_.dptr_, buffer_.size(0));
+    return expr::reshape(buffer_, shape);
+  }
+
+ private:
+  inline void GenGaussian(float *dptr, size_t size, float mu, float sigma) {
+    curandStatus_t status;
+    status = curandGenerateNormal(gen_, dptr, size, mu, sigma);
+    utils::Check(status == CURAND_STATUS_SUCCESS, "CURAND Gen Normal failed");
+  }
+  inline void GenGaussian(double *dptr, size_t size, double mu, double sigma) {
+    curandStatus_t status;
+    status = curandGenerateNormalDouble(gen_, dptr, size, mu, sigma);
+    utils::Check(status == CURAND_STATUS_SUCCESS, "CURAND Gen Normal failed");
+  }
+  inline void GenUniform(float *dptr, size_t size) {
+    curandStatus_t status;
+    status = curandGenerateUniform(gen_, dptr, size);
+    utils::Check(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed");
+  }
+  inline void GenUniform(double *dptr, size_t size) {
+    curandStatus_t status;
+    status = curandGenerateUniformDouble(gen_, dptr, size);
+    utils::Check(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed");
+  }
+  /*! \brief random number generator */
+  curandGenerator_t gen_;
+  /*!
\brief templ buffer */ + TensorContainer buffer_; +}; // class Random +#endif +} // namespace mshadow +#endif // MSHADOW_RANDOM_H_ diff --git a/mshadow/tensor.h b/mshadow/tensor.h index 8ed2baf41f18..0d460a8442cf 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -553,6 +553,7 @@ inline void MapReduceKeepHighDim(TRValue *dst, #include "./tensor_cpu-inl.h" #include "./io.h" #include "./tensor_container.h" +#include "./random.h" // add definition of scalar related operators #ifdef MSAHDOW_SCALAR_ #error "MSHADOW_SCALAR_ must not be defined" diff --git a/mshadow/tensor_container.h b/mshadow/tensor_container.h index 71d096e9d89d..42542d7490da 100644 --- a/mshadow/tensor_container.h +++ b/mshadow/tensor_container.h @@ -65,7 +65,7 @@ class TensorContainer: public Tensor { if (s2.shape_[1] > data_.stride_ || s2.shape_[0] > data_.size(0)) { this->AllocByShape(shape); } else { - this->shape = shape; + this->shape_ = shape; if (this->pad_) { this->stride_ = data_.stride_; } else { @@ -136,15 +136,15 @@ class TensorContainer: public Tensor { // freespace inline void FreeSpace(void) { if (data_.dptr_ != NULL) { - mshadow::FreeSpace(data_); - data_.dptr = this->dptr = NULL; + mshadow::FreeSpace(&data_); + data_.dptr_ = this->dptr_ = NULL; } } inline void AllocByShape(const Shape& shape) { if (data_.dptr_ != NULL) this->FreeSpace(); data_.shape_ = shape.FlatTo2D(); - mshadow::AllocSpace(data_, pad_); - this->dptr = data_.dptr_; + mshadow::AllocSpace(&data_, pad_); + this->dptr_ = data_.dptr_; this->shape_ = shape; if (this->pad_) { this->stride_ = data_.stride_; diff --git a/mshadow/tensor_random.h b/mshadow/tensor_random.h deleted file mode 100644 index b3f0b8498e0c..000000000000 --- a/mshadow/tensor_random.h +++ /dev/null @@ -1,299 +0,0 @@ -#ifndef MSHADOW_TENSOR_RANDOM_H -#define MSHADOW_TENSOR_RANDOM_H -/*! - * \file tensor_random.h - * \brief Random inline functions for tensor. - * \author Bing Xu, Tianqi Chen - * Based on curand|MKL|stdlib - */ -#include -#include "tensor.h" -#include "tensor_container.h" - -namespace mshadow { - /*! - * \brief random number generator - * \tparam Device the device of random number generator - */ - template - class Random {}; - - /*! \brief CPU random number generator */ - template<> - class Random { - public: - /*! - * \brief constructor of random engine - * \param seed random number seed - */ - Random( int seed ){ - #if MSHADOW_USE_MKL - int status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed); - utils::Assert( status == VSL_STATUS_OK, "MKL VSL Random engine failed to be initialized.\n" ); - #else - srand(seed); - #endif - buffer_.Resize( Shape1( kRandBufferSize ) ); - } - ~Random() { - #if MSHADOW_USE_MKL - vslDeleteStream(&vStream_); - #endif - } - /*! - * \brief seed random number generator using this seed - * \param seed seed of prng - */ - inline void Seed( int seed ){ - #if MSHADOW_USE_MKL - int status = vslDeleteStream(&vStream_); - utils::Assert(status == VSL_STATUS_OK); - status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed); - utils::Assert(status == VSL_STATUS_OK); - #else - srand( seed ); - #endif - } - /*! 
- * \brief generate data from uniform [a,b) - * \param dst destination - * \param a lower bound of uniform - * \param b upper bound of uniform - * \tparam dim dimension of tensor - */ - template - inline void SampleUniform( Tensor &dst, real_t a=0.0f, real_t b=1.0f ) { - Tensor mat = dst.FlatTo2D(); - for ( index_t i = 0; i < mat.shape[1]; ++i ) { - #if MSHADOW_USE_MKL - #if MSHADOW_SINGLE_PRECISION - int status = vsRngUniform( 0, vStream_, mat.shape[0], mat[i].dptr, a, b ); - #else - int status = vdRngUniform( 0, vStream_, mat.shape[0], mat[i].dptr, a, b ); - #endif - utils::Assert(status == VSL_STATUS_OK, "Failed to generate random number by MKL.\n" ); - #else - // use stdlib - for ( index_t j = 0; j < mat.shape[0]; ++j ) { - mat[i][j] = this->RandNext()*(b-a) + a; - } - #endif - } - } - /*! - * \brief generate data from standard gaussian - * \param dst destination - * \param mu mean variable - * \param sigma standard deviation - * \tparam dim dimension of tensor - */ - template - inline void SampleGaussian( Tensor &dst, real_t mu = 0.0f, real_t sigma = 1.0f ) { - if( sigma <= 0.0f ) { - dst = mu; return; - } - Tensor mat = dst.FlatTo2D(); - for (index_t i = 0; i < mat.shape[1]; ++i) { - #if MSHADOW_USE_MKL - #if MSHADOW_SINGLE_PRECISION - int status = vsRngGaussian( 0, vStream_, mat.shape[0], mat[i].dptr, mu, sigma ); - #else - int status = vdRngGaussian( 0, vStream_, mat.shape[0], mat[i].dptr, mu, sigma ); - #endif - utils::Assert(status == VSL_STATUS_OK, "Failed to generate random number by MKL.\n" ); - #else - real_t g1 = 0.0f, g2 = 0.0f; - for (index_t j = 0; j < mat.shape[0]; ++j) { - if( (j & 1) == 0 ){ - this->SampleNormal2D( g1, g2 ); - mat[i][j] = mu + g1 * sigma; - }else{ - mat[i][j] = mu + g2 * sigma; - } - } - #endif - } - } - /*! - * \brief return a temporal expression storing standard gaussian random variables - * the temporal tensor is only valid before next call of gaussian or uniform - * can be used as part of expression - * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, - * since second call of gaussian(s2) makes gaussian(s1) invalid - * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression - * \param shape shape of the tensor - * \tparam dim dimension of tensor - */ - template - inline expr::ReshapeExp,dim,1> gaussian( Shape shape ){ - buffer_.Resize( Shape1( shape.Size() ) ); - this->SampleGaussian( buffer_, 0.0f, 1.0f ); - return expr::reshape( buffer_, shape ); - } - /*! - * \brief return a temporal expression storing standard uniform [0,1) - * the temporal tensor is only valid before next call of gaussian or uniform - * can be used as part of expression - * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, - * since second call of gaussian(s2) makes gaussian(s1) invalid - * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression - * \param shape shape of the tensor - * \tparam dim dimension of tensor - */ - template - inline expr::ReshapeExp,dim,1> uniform( Shape shape ){ - buffer_.Resize( Shape1( shape.Size() ) ); - this->SampleUniform( buffer_, 0.0f, 1.0f ); - return expr::reshape( buffer_, shape ); - } - private: - /*! \brief get next random number from rand */ - inline real_t RandNext( void ){ - return static_cast(rand()) / (static_cast(RAND_MAX)+1.0f); - } - /*! 
\brief return a real numer uniform in (0,1) */ - inline real_t RandNext2( void ){ - return (static_cast( rand() ) + 1.0 ) / (static_cast(RAND_MAX) + 2.0); - } - /*! - * \brief sample iid xx,yy ~N(0,1) - * \param xx first gaussian output - * \param yy second gaussian output - */ - inline void SampleNormal2D( real_t &xx, real_t &yy ){ - real_t x,y,s; - do{ - x = 2.0f * RandNext2() - 1.0f; - y = 2.0f * RandNext2() - 1.0f; - s = x*x + y*y; - }while( s >= 1.0f || s == 0.0f ); - real_t t = std::sqrt( -2.0f * std::log( s ) / s ) ; - xx = x * t; yy = y * t; - } - private: - #if MSHADOW_USE_MKL - /*! \brief stream used by MKL VSL */ - VSLStreamStatePtr vStream_; - #endif - /*! \brief temporal space used to store random numbers */ - TensorContainer buffer_; - }; // class Random - -#ifdef __CUDACC__ - - /*! \brief GPU random number generator */ - template<> - class Random { - public: - /*! - * \brief constructor of random engine - * \param seed random number seed - */ - Random(int seed) { - curandStatus_t status; - status = curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT); - utils::Assert(status == CURAND_STATUS_SUCCESS, "Can not create CURAND Generator"); - this->Seed( seed ); - buffer_.Resize( Shape1(kRandBufferSize) ); - } - - ~Random() { - curandStatus_t status; - status = curandDestroyGenerator(gen_); - utils::Assert(status == CURAND_STATUS_SUCCESS, "Destory CURAND Gen failed"); - } - /*! - * \brief seed random number generator using this seed - * \param seed seed of prng - */ - inline void Seed( int seed ){ - curandStatus_t status; - status = curandSetPseudoRandomGeneratorSeed(gen_, seed); - utils::Assert(status == CURAND_STATUS_SUCCESS, "Set CURAND seed failed."); - } - /*! - * \brief generate data from uniform [a,b) - * \param dst destination - * \param a lower bound of uniform - * \param b upper bound of uniform - * \tparam dim dimension of tensor - */ - template - inline void SampleUniform(Tensor &dst, real_t a=0.0f, real_t b=1.0f) { - if( a == 0.0f && b == 1.0f ){ - dst = this->uniform( dst.shape ); - }else{ - dst = this->uniform( dst.shape ) *(b-a) + a; - } - } - /*! - * \brief generate data from standard gaussian - * \param dst destination - * \param mu mean variable - * \param sigma standard deviation - * \tparam dim dimension of tensor - */ - template - inline void SampleGaussian(Tensor &dst, real_t mu = 0.0f, real_t sigma = 1.0f) { - dst = this->gaussian( dst.shape, mu, sigma ); - } - /*! 
- * \brief return a temporal expression storing standard gaussian random variables - * the temporal tensor is only valid before next call of gaussian or uniform - * can be used as part of expression - * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, - * since second call of gaussian(s2) makes gaussian(s1) invalid - * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression - * \param shape shape of the tensor - * \param mu mean - * \param sigma variance - * \tparam dim dimension of tensor - */ - template - inline expr::ReshapeExp,dim,1> gaussian( Shape shape, real_t mu=0.0f, real_t sigma=1.0f){ - size_t aligned_sz = ((shape.Size() + 1UL)>>1)<<1; - // allocate alligned size - buffer_.Resize( Shape1( aligned_sz ) ); - buffer_.Resize( Shape1( shape.Size() ) ); - curandStatus_t status; - #if MSHADOW_SINGLE_PRECISION - status = curandGenerateNormal(gen_, buffer_.dptr, aligned_sz , mu, sigma); - #else - status = curandGenerateNormalDouble(gen_, buffer_.dptr, buffer_.shape[0], mu, sigma); - #endif - utils::Assert(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed\n"); - return expr::reshape( buffer_, shape ); - } - /*! - * \brief return a temporal expression storing standard uniform [0,1) - * the temporal tensor is only valid before next call of gaussian or uniform - * can be used as part of expression - * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, - * since second call of gaussian(s2) makes gaussian(s1) invalid - * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression - * \param shape shape of the tensor - * \tparam dim dimension of tensor - */ - template - inline expr::ReshapeExp,dim,1> uniform(Shape shape) { - buffer_.Resize( Shape1( shape.Size() ) ); - curandStatus_t status; - #if MSHADOW_SINGLE_PRECISION - status = curandGenerateUniform(gen_, buffer_.dptr, buffer_.shape[0] ); - #else - status = curandGenerateUniformDouble(gen_, buffer_.dptr, buffer_.shape[0] ); - #endif - utils::Assert(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed\n"); - return expr::reshape( buffer_, shape ); - } - private: - /*! \brief random numbeer generator */ - curandGenerator_t gen_; - /*! \brief templ buffer */ - TensorContainer buffer_; - }; // class Random - #endif - -}; // namespace mshadow - -#endif // MSHADOW_TENSOR_RANDOM_H From afe5f6ed64d48f83d3df4f08298dad50f69bc86b Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 26 Dec 2014 22:52:37 -0800 Subject: [PATCH 035/147] lint pass --- mshadow/cuda/cuda_reduce.cuh | 121 --------- mshadow/cuda/reduce.cuh | 116 +++++++++ mshadow/cuda/tensor_gpu-inl.cuh | 433 ++++++++++++++++---------------- mshadow/tensor_cpu-inl.h | 2 +- mshadow/tensor_gpu-inl.h | 163 ++++++++++++ mshadow/tensor_gpu-inl.hpp | 148 ----------- 6 files changed, 497 insertions(+), 486 deletions(-) delete mode 100644 mshadow/cuda/cuda_reduce.cuh create mode 100644 mshadow/cuda/reduce.cuh create mode 100644 mshadow/tensor_gpu-inl.h delete mode 100644 mshadow/tensor_gpu-inl.hpp diff --git a/mshadow/cuda/cuda_reduce.cuh b/mshadow/cuda/cuda_reduce.cuh deleted file mode 100644 index 393132ab438d..000000000000 --- a/mshadow/cuda/cuda_reduce.cuh +++ /dev/null @@ -1,121 +0,0 @@ -#ifndef MSHADOW_CUDA_REDUCE_CUH -#define MSHADOW_CUDA_REDUCE_CUH -/*! 
- * \file cuda_reduce.cuh - * \brief helper functions to do reduction - * \author Tianqi Chen - */ -namespace mshadow{ - namespace cuda{ - /* - * \brief reduce over the dimension x - * \tparam Reducer reducer - * \tparam x_bits dimension = 1< - inline __device__ void Reduce1D( volatile real_t buf[1< - inline __device__ void Reduce1DNotAlign( volatile real_t buf[1< - inline __device__ void ReduceX( volatile real_t buf[], int tid ){ - if( x_bits >= 10 ){ - if( tid < 512 ) Reducer::Reduce( buf[tid] , buf[tid + 512] ); - __syncthreads(); - } - if( x_bits >= 9 ){ - if( tid < 256 ) Reducer::Reduce( buf[tid] , buf[tid + 256] ); - __syncthreads(); - } - if( x_bits >= 8 ){ - if( tid < 128 ) Reducer::Reduce( buf[tid] , buf[tid + 128] ); - __syncthreads(); - } - if( x_bits >= 7 ){ - if( tid < 64 ) Reducer::Reduce( buf[tid] , buf[tid + 64 ] ); - __syncthreads(); - } - if( x_bits >= 6 ){ - if( tid < 32 ) Reducer::Reduce( buf[tid] , buf[tid + 32] ); - __syncthreads(); - } - // in warp optimization - if( x_bits >= 5 ){ - if( tid < 16 ) Reducer::Reduce( buf[tid] , buf[tid + 16] ); - #if __CUDA_ARCH__ < 200 - __syncthreads(); - #else - __MSHADOW_EMUSYNC__; - #endif - } - if( x_bits >= 4 ){ - if( tid < 8 ) Reducer::Reduce( buf[tid] , buf[tid + 8 ] ); - __MSHADOW_EMUSYNC__; - } - if( x_bits >= 3 ){ - if( tid < 4 ) Reducer::Reduce( buf[tid] , buf[tid + 4 ] ); - __MSHADOW_EMUSYNC__; - } - if( x_bits >= 2 ){ - if( tid < 2 ) Reducer::Reduce( buf[tid] , buf[tid + 2 ] ); - __MSHADOW_EMUSYNC__; - } - if( x_bits >= 1 ){ - if( tid < 1 ) Reducer::Reduce( buf[tid] , buf[tid + 1 ] ); - __MSHADOW_EMUSYNC__; - } - }; - - template - inline __device__ void Reduce1D( volatile real_t buf[1<( buf, threadIdx.x ); - } - - // reduce with a upper bound - #define __RD_NON_ALIGN(els,x_bits) \ - els \ - if( xmax_bits >= x_bits && x_size >= (1 << x_bits) ){ \ - if( tid < (1 << x_bits) && tid + (1<( buf, tid ); \ - } \ - - template - inline __device__ void Reduce1DNotAlign( volatile real_t buf[], int x_size ){ - int tid = threadIdx.x; - __RD_NON_ALIGN(, 8) - __RD_NON_ALIGN(else, 7) - __RD_NON_ALIGN(else, 6) - __RD_NON_ALIGN(else, 5) - __RD_NON_ALIGN(else, 4) - __RD_NON_ALIGN(else, 3) - __RD_NON_ALIGN(else, 2) - __RD_NON_ALIGN(else, 1) - } - }; -}; - -#endif // MSHADOW_CUDA_REDUCE_CUH - diff --git a/mshadow/cuda/reduce.cuh b/mshadow/cuda/reduce.cuh new file mode 100644 index 000000000000..05cf4d79a292 --- /dev/null +++ b/mshadow/cuda/reduce.cuh @@ -0,0 +1,116 @@ +/*! 
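+ * Typical call pattern for the block-level reducers declared in this file
+ * (a hedged sketch assuming a 256-thread block; `partial` is a per-thread
+ * value and is not part of this file):
+ * \code
+ * __shared__ real_t buf[1 << 8];
+ * buf[threadIdx.x] = partial;   // every thread deposits its partial result
+ * __syncthreads();
+ * Reduce1D<red::sum, 8>(buf);   // afterwards buf[0] holds the block total
+ * \endcode
+ *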
+ * Copyright (c) 2014 by Contributors + * \file reduce.cuh + * \brief helper functions to do reduction + * \author Tianqi Chen + */ +#ifndef MSHADOW_CUDA_REDUCE_CUH_ +#define MSHADOW_CUDA_REDUCE_CUH_ + +namespace mshadow { +namespace cuda { +/* + * \brief reduce over the dimension x + * \tparam Reducer reducer + * \tparam x_bits dimension = 1< +inline __device__ void Reduce1D(volatile real_t buf[1 << x_bits]); +/* + * \brief reduce over the dimension x + * \tparam Reducer reducer + * \tparam xmax_bits maximum size of buffer + * \param xsize size of x dimension, not sure if aligned + */ +template +inline __device__ void +Reduce1DNotAlign(volatile real_t buf[1 << xmax_bits], int xsize); +// ===============================================x=== +// implementations afterwards, +// no need to read if only use the functions +// -------------------------------------------------- +#ifdef __DEVICE_EMULATION__ +#define __MSHADOW_EMUSYNC__ __syncthreads() +#else +#define __MSHADOW_EMUSYNC__ +#endif + +template +inline __device__ void ReduceX(volatile real_t buf[], int tid) { + if (x_bits >= 10) { + if (tid < 512) Reducer::Reduce(buf[tid] , buf[tid + 512]); + __syncthreads(); + } + if (x_bits >= 9) { + if (tid < 256) Reducer::Reduce(buf[tid] , buf[tid + 256]); + __syncthreads(); + } + if (x_bits >= 8) { + if (tid < 128) Reducer::Reduce(buf[tid] , buf[tid + 128]); + __syncthreads(); + } + if (x_bits >= 7) { + if (tid < 64) Reducer::Reduce(buf[tid] , buf[tid + 64]); + __syncthreads(); + } + if (x_bits >= 6) { + if (tid < 32) Reducer::Reduce(buf[tid] , buf[tid + 32]); + __syncthreads(); + } + // in warp optimization + if (x_bits >= 5) { + if (tid < 16) Reducer::Reduce(buf[tid] , buf[tid + 16]); +#if __CUDA_ARCH__ < 200 + __syncthreads(); +#else + __MSHADOW_EMUSYNC__; +#endif + } + if (x_bits >= 4) { + if (tid < 8) Reducer::Reduce(buf[tid] , buf[tid + 8]); + __MSHADOW_EMUSYNC__; + } + if (x_bits >= 3) { + if (tid < 4) Reducer::Reduce(buf[tid] , buf[tid + 4]); + __MSHADOW_EMUSYNC__; + } + if (x_bits >= 2) { + if (tid < 2) Reducer::Reduce(buf[tid] , buf[tid + 2]); + __MSHADOW_EMUSYNC__; + } + if (x_bits >= 1) { + if (tid < 1) Reducer::Reduce(buf[tid] , buf[tid + 1]); + __MSHADOW_EMUSYNC__; + } +} +template +inline __device__ void Reduce1D(volatile real_t buf[1 << x_bits]) { + ReduceX(buf, threadIdx.x); +} +// reduce with a upper bound +#define __RD_NON_ALIGN(els, x_bits) \ + els \ + if (xmax_bits >= x_bits && x_size >= (1 << x_bits)) { \ + if (tid < (1 << x_bits) && tid + (1 << x_bits) < x_size) { \ + Reducer::Reduce(buf[tid] , buf[tid + (1 << x_bits)]); \ + } \ + __syncthreads(); \ + ReduceX(buf, tid); \ + } \ + +template +inline __device__ void Reduce1DNotAlign(volatile real_t buf[], int x_size) { + int tid = threadIdx.x; + __RD_NON_ALIGN(, 8) + __RD_NON_ALIGN(else, 7) + __RD_NON_ALIGN(else, 6) + __RD_NON_ALIGN(else, 5) + __RD_NON_ALIGN(else, 4) + __RD_NON_ALIGN(else, 3) + __RD_NON_ALIGN(else, 2) + __RD_NON_ALIGN(else, 1) +} +} // namespace cuda +} // namespace mshadow +#endif // MSHADOW_CUDA_REDUCE_CUH_ + diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index 61e477cf531b..65d186738cf5 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -1,231 +1,232 @@ -#ifndef MSHADOW_TENSOR_GPU_INL_CUH -#define MSHADOW_TENSOR_GPU_INL_CUH /*! 
+ * Copyright (c) 2014 by Contributors * \file tensor_gpu-inl.cuh * \brief implementation of GPU code using CUDA * \author Bing Xu, Tianqi Chen */ +#ifndef MSHADOW_CUDA_TENSOR_GPU_INL_CUH_ +#define MSHADOW_CUDA_TENSOR_GPU_INL_CUH_ #include "../tensor.h" -#include "cuda_reduce.cuh" +#include "./cuda_reduce.cuh" -namespace mshadow{ - namespace cuda{ - #ifndef __CUDA_ARCH__ - #warning "__CUDA_ARCH__ is not defined, I will assume compiling with CUDA verion greater than 2.0" - #endif - /* load unit for memory access */ - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200 - const int kMemUnitBits = 5; - const int kMaxThreadsPerBlock = 1024; - #else - const int kMemUnitBits = 4; - const int kMaxThreadsPerBlock = 512; - #endif - /*! \brief number of units that can do synchronized update, half warp size */ - const int kMemUnit = 1 << kMemUnitBits; - /*! \brief mask that could be helpful sometime */ - const int kMemUnitMask = kMemUnit - 1; - /*! \brief suggested thread number(logscale) for mapping kernel */ - const int kBaseThreadBits = 8; - /*! \brief suggested thread number for mapping kernel */ - const int kBaseThreadNum = 1 << kBaseThreadBits; - /*! \brief maximum value of grid */ - const int kMaxGridNum = 65535; - /*! \brief suggested grid number for mapping kernel */ - const int kBaseGridNum = 1024; - - /*! \brief get align stride for given size in x dimension */ - inline index_t GetAlignStride( index_t xsize, index_t xstride ){ - if( (xstride & (kMemUnit-1)) == 0 ){ - return ( (xsize + kMemUnit - 1) >> kMemUnitBits) << kMemUnitBits; - }else{ - // if originally space is not aligned, no necessary to to alligned thread allocation - return xsize; - } - } - inline void CheckLaunchParam( dim3 dimGrid, dim3 dimBlock, const char *estr = "" ){ - if( dimBlock.x*dimBlock.y*dimBlock.z > (unsigned)kMaxThreadsPerBlock || - dimGrid.x > 65535 || dimGrid.y > 65535 ){ - fprintf( stderr, "%s[%u,%u,%u]:", estr, dimBlock.x, dimBlock.y, dimBlock.z ); - utils::Error( "too large launch parameter\n"); - } - } - }; +namespace mshadow { +namespace cuda { +#ifndef __CUDA_ARCH__ +#warning "__CUDA_ARCH__ is not defined, I will assume compiling with CUDA verion greater than 2.0" +#endif +/* load unit for memory access */ +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200 +const int kMemUnitBits = 5; +const int kMaxThreadsPerBlock = 1024; +#else +const int kMemUnitBits = 4; +const int kMaxThreadsPerBlock = 512; +#endif +/*! \brief number of units that can do synchronized update, half warp size */ +const int kMemUnit = 1 << kMemUnitBits; +/*! \brief mask that could be helpful sometime */ +const int kMemUnitMask = kMemUnit - 1; +/*! \brief suggested thread number(logscale) for mapping kernel */ +const int kBaseThreadBits = 8; +/*! \brief suggested thread number for mapping kernel */ +const int kBaseThreadNum = 1 << kBaseThreadBits; +/*! \brief maximum value of grid */ +const int kMaxGridNum = 65535; +/*! 
\brief suggested grid number for mapping kernel */ +const int kBaseGridNum = 1024; - namespace cuda { - template - __device__ void MapPlanProc( Tensor dst, const index_t xstride, const Plan exp, int block_idx ){ - const index_t tid = (block_idx << block_dim_bits) + threadIdx.x; - const int y = tid / xstride; - const int x = tid % xstride; - if (y < dst.shape[1] && x < dst.shape[0]) { - Saver::Save(dst[y][x], exp.Eval(y,x)); - } - } - template - __global__ void MapPlanKernel( Tensor dst, const index_t xstride, const Plan exp ){ - MapPlanProc( dst, xstride, exp, blockIdx.x ); - } - template - __global__ void MapPlanLargeKernel( Tensor dst, const index_t xstride, const Plan exp, int repeat ){ - for( int i = 0; i < repeat; ++i ){ - MapPlanProc( dst, xstride, exp, blockIdx.x + i*grid_size ); - } - } - - template - inline void MapPlan( Tensor dst, const expr::Plan &plan ){ - const index_t xstride = GetAlignStride( dst.shape[0], dst.shape.stride_ ); - const int num_block = ( dst.shape[1]*xstride + kBaseThreadNum-1) / kBaseThreadNum; - dim3 dimBlock(kBaseThreadNum, 1, 1); +/*! \brief get align stride for given size in x dimension */ +inline index_t GetAlignStride(index_t xsize, index_t xstride) { + if((xstride & (kMemUnit-1)) == 0) { + return ((xsize + kMemUnit - 1) >> kMemUnitBits) << kMemUnitBits; + }else{ + // if originally space is not aligned, no necessary to to alligned thread allocation + return xsize; + } +} +inline void CheckLaunchParam(dim3 dimGrid, dim3 dimBlock, const char *estr = "") { + if(dimBlock.x*dimBlock.y*dimBlock.z > (unsigned)kMaxThreadsPerBlock || + dimGrid.x > 65535 || dimGrid.y > 65535) { + fprintf(stderr, "%s[%u,%u,%u]:", estr, dimBlock.x, dimBlock.y, dimBlock.z); + utils::Error("too large launch parameter\n"); + } +} +}; - if (num_block < kMaxGridNum) { - dim3 dimGrid(num_block, 1, 1); - MapPlanKernel, kBaseThreadBits> \ - <<>>(dst, xstride, plan); - } else { - int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum; - dim3 dimGrid( kBaseGridNum, 1 , 1 ); - MapPlanLargeKernel, kBaseThreadBits, kBaseGridNum> \ - <<>>(dst, xstride, plan, repeat ); - } - } - }; // namespace cuda - - namespace cuda{ - template - __global__ void MapRedKeepLowestKernel( Tensor dst, Plan plan, real_t scale, Shape<2> eshape ){ - const unsigned warp_size = 1 << warp_bits; - const unsigned x = (blockIdx.x< +__device__ void MapPlanProc(Tensor dst, const index_t xstride, const Plan exp, int block_idx) { + const index_t tid = (block_idx << block_dim_bits) + threadIdx.x; + const int y = tid / xstride; + const int x = tid % xstride; + if (y < dst.shape[1] && x < dst.shape[0]) { + Saver::Save(dst[y][x], exp.Eval(y,x)); + } +} +template +__global__ void MapPlanKernel(Tensor dst, const index_t xstride, const Plan exp) { + MapPlanProc(dst, xstride, exp, blockIdx.x); +} +template +__global__ void MapPlanLargeKernel(Tensor dst, const index_t xstride, const Plan exp, int repeat) { + for(int i = 0; i < repeat; ++i) { + MapPlanProc(dst, xstride, exp, blockIdx.x + i*grid_size); + } +} - // note: reverse store [y][x], so that we can reduce over threadIdx.x, use warp optimization - if( threadIdx.y < eshape[1] && x < eshape[0] ){ - s_res[ threadIdx.x ][ threadIdx.y ] = plan.Eval( threadIdx.y, x ); - } - for( unsigned y = warp_size; y < eshape[1]; y += warp_size ){ - if( threadIdx.y + y < eshape[1] && x < eshape[0] ){ - Reducer::Reduce( s_res[ threadIdx.x ][ threadIdx.y ], plan.Eval( threadIdx.y + y, x ) ); - } - } - __syncthreads(); - if( eshape[1] >= warp_size ){ - Reduce1D( s_res[ threadIdx.y ] ); - }else{ - 
Reduce1DNotAlign( s_res[ threadIdx.y ], eshape[1] ); - } - __syncthreads(); - - if( threadIdx.y == 0 && x < eshape[0] ){ - Saver::Save( dst[x], s_res[ threadIdx.x ][ 0 ] * scale ); - } - } - - template - inline void MapReduceKeepLowest( Tensor dst, const expr::Plan &plan, real_t scale, Shape<2> eshape ){ - dim3 dimBlock( kMemUnit, kMemUnit ); - dim3 dimGrid ( (eshape[0]+kMemUnit-1) >> kMemUnitBits ); - CheckLaunchParam( dimGrid, dimBlock, "MapRedKeepLowestKernel" ); - MapRedKeepLowestKernel<<>>( dst, plan, scale, eshape ); - } - }; // namespace cuda - - namespace cuda{ - template - __global__ void MapReduceKeepDim2Kernel( Tensor dst, Plan plan, real_t scale, Shape<4> pshape ){ - const int block_size = 1 << block_dim_bits; - __shared__ real_t s_rec[ block_size ]; - const int c = blockIdx.x; - const index_t tot = pshape[0]*pshape[1]*pshape[3]; +template +inline void MapPlan(Tensor dst, const expr::Plan &plan) { + const index_t xstride = GetAlignStride(dst.shape[0], dst.shape.stride_); + const int num_block = (dst.shape[1]*xstride + kBaseThreadNum-1) / kBaseThreadNum; + dim3 dimBlock(kBaseThreadNum, 1, 1); + + if (num_block < kMaxGridNum) { + dim3 dimGrid(num_block, 1, 1); + MapPlanKernel, kBaseThreadBits> \ + <<>>(dst, xstride, plan); + } else { + int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum; + dim3 dimGrid(kBaseGridNum, 1 , 1); + MapPlanLargeKernel, kBaseThreadBits, kBaseGridNum> \ + <<>>(dst, xstride, plan, repeat); + } +} +}; // namespace cuda - real_t res = Reducer::kInitV; - for( index_t i_offset = 0; i_offset < tot; i_offset += block_size ){ - index_t i = i_offset + threadIdx.x; - if( i< tot ){ - const index_t x = i % pshape[0]; - i /= pshape[0]; - const index_t y = i % pshape[1]; - const index_t n = i / pshape[1]; - Reducer::Reduce( res, plan.Eval( (n*pshape[2] + c) * pshape[1] + y, x ) ); - } - } - s_rec[ threadIdx.x ] = res; - __syncthreads(); - Reduce1D( s_rec ); - if( threadIdx.x == 0 ){ - Saver::Save( dst[c], s_rec[0]*scale ); - } - } +namespace cuda{ +template +__global__ void MapRedKeepLowestKernel(Tensor dst, Plan plan, real_t scale, Shape<2> eshape) { + const unsigned warp_size = 1 << warp_bits; + const unsigned x = (blockIdx.x<= warp_size) { + Reduce1D(s_res[threadIdx.y]); + }else{ + Reduce1DNotAlign(s_res[threadIdx.y], eshape[1]); + } + __syncthreads(); + + if(threadIdx.y == 0 && x < eshape[0]) { + Saver::Save(dst[x], s_res[threadIdx.x][0] * scale); + } +} - template - inline void MapReduceKeepDim2( Tensor dst, const Plan &plan, real_t scale, Shape<4> pshape ){ - dim3 dimBlock( kBaseThreadNum ); - dim3 dimGrid ( dst.shape[0] ); - CheckLaunchParam( dimGrid, dimBlock, "MapReduceKeepDim2" ); - MapReduceKeepDim2Kernel - <<>>( dst, plan, scale, pshape ); - } - }; - - namespace cuda{ - template - __global__ void SoftmaxKernel( Tensor dst, Tensor src ){ - const unsigned x_size = 1 << x_bits; - const int y = blockIdx.x; - __shared__ real_t s_rec[ x_size ]; - - // step 1: get max - if( threadIdx.x < dst.shape[ 0 ] ){ - s_rec[ threadIdx.x ] = src[ y ][ threadIdx.x ] ; - } - for( unsigned x = x_size; x < dst.shape[0]; x += x_size ){ - if( x + threadIdx.x < dst.shape[0] ){ - real_t a = src[ y ][ x + threadIdx.x ]; - s_rec[ threadIdx.x ] = max( a, s_rec[ threadIdx.x] ); - } - } - __syncthreads(); - if( threadIdx.x >= dst.shape[0] ){ - s_rec[ threadIdx.x ] = s_rec[0]; - } - __syncthreads(); - Reduce1D( s_rec ); - __syncthreads(); - real_t smax = s_rec[0]; - __syncthreads(); - s_rec[ threadIdx.x ] = 0.0f; - __syncthreads(); +template +inline void MapReduceKeepLowest(Tensor 
dst, const expr::Plan &plan, real_t scale, Shape<2> eshape) { + dim3 dimBlock(kMemUnit, kMemUnit); + dim3 dimGrid ((eshape[0]+kMemUnit-1) >> kMemUnitBits); + CheckLaunchParam(dimGrid, dimBlock, "MapRedKeepLowestKernel"); + MapRedKeepLowestKernel<<>>(dst, plan, scale, eshape); +} +}; // namespace cuda - // calculate normalizer, with writeback - for( unsigned x = 0; x < dst.shape[0]; x += x_size ){ - if( x + threadIdx.x < dst.shape[0] ){ - real_t p = expf( src[ y ][ x + threadIdx.x ] - smax ); - s_rec[ threadIdx.x ] += p; - // write back first, will fetch later - dst[ y ][ x + threadIdx.x ] = p; - } - } - // calculate normalizer - __syncthreads(); - Reduce1D( s_rec ); - __syncthreads(); - real_t ssum = s_rec[0]; +namespace cuda{ +template +__global__ void MapReduceKeepDim2Kernel(Tensor dst, Plan plan, real_t scale, Shape<4> pshape) { + const int block_size = 1 << block_dim_bits; + __shared__ real_t s_rec[block_size]; + const int c = blockIdx.x; + const index_t tot = pshape[0]*pshape[1]*pshape[3]; + + real_t res = Reducer::kInitV; + for(index_t i_offset = 0; i_offset < tot; i_offset += block_size) { + index_t i = i_offset + threadIdx.x; + if(i< tot) { + const index_t x = i % pshape[0]; + i /= pshape[0]; + const index_t y = i % pshape[1]; + const index_t n = i / pshape[1]; + Reducer::Reduce(res, plan.Eval((n*pshape[2] + c) * pshape[1] + y, x)); + } + } + s_rec[threadIdx.x] = res; + __syncthreads(); + Reduce1D(s_rec); + if(threadIdx.x == 0) { + Saver::Save(dst[c], s_rec[0]*scale); + } +} - for( unsigned x = 0; x < dst.shape[0]; x += x_size ){ - if( x + threadIdx.x < dst.shape[0] ){ - dst[ y ][ x + threadIdx.x ] /= ssum; - } - } - } - - inline void Softmax( Tensor &dst, const Tensor &src ){ - dim3 dimBlock( kBaseThreadNum ); - dim3 dimGrid ( dst.shape[1] ); - utils::Assert( dst.shape == src.shape, "Softmax: shape mismatch" ); - CheckLaunchParam( dimGrid, dimBlock, "Softmax" ); - SoftmaxKernel<<>>( dst, src ); - } - }; // namespace cuda +template +inline void MapReduceKeepDim2(Tensor dst, const Plan &plan, real_t scale, Shape<4> pshape) { + dim3 dimBlock(kBaseThreadNum); + dim3 dimGrid (dst.shape[0]); + CheckLaunchParam(dimGrid, dimBlock, "MapReduceKeepDim2"); + MapReduceKeepDim2Kernel + <<>>(dst, plan, scale, pshape); +} +}; + +namespace cuda{ +template +__global__ void SoftmaxKernel(Tensor dst, Tensor src) { + const unsigned x_size = 1 << x_bits; + const int y = blockIdx.x; + __shared__ real_t s_rec[x_size]; + + // step 1: get max + if(threadIdx.x < dst.shape[0]) { + s_rec[threadIdx.x] = src[y][threadIdx.x] ; + } + for(unsigned x = x_size; x < dst.shape[0]; x += x_size) { + if(x + threadIdx.x < dst.shape[0]) { + real_t a = src[y][x + threadIdx.x]; + s_rec[threadIdx.x] = max(a, s_rec[threadIdx.x]); + } + } + __syncthreads(); + if(threadIdx.x >= dst.shape[0]) { + s_rec[threadIdx.x] = s_rec[0]; + } + __syncthreads(); + Reduce1D(s_rec); + __syncthreads(); + real_t smax = s_rec[0]; + __syncthreads(); + s_rec[threadIdx.x] = 0.0f; + __syncthreads(); + + // calculate normalizer, with writeback + for(unsigned x = 0; x < dst.shape[0]; x += x_size) { + if(x + threadIdx.x < dst.shape[0]) { + real_t p = expf(src[y][x + threadIdx.x] - smax); + s_rec[threadIdx.x] += p; + // write back first, will fetch later + dst[y][x + threadIdx.x] = p; + } + } + // calculate normalizer + __syncthreads(); + Reduce1D(s_rec); + __syncthreads(); + real_t ssum = s_rec[0]; + + for(unsigned x = 0; x < dst.shape[0]; x += x_size) { + if(x + threadIdx.x < dst.shape[0]) { + dst[y][x + threadIdx.x] /= ssum; + } + } +} + +inline void 
Softmax(Tensor<gpu, 2> &dst,
+                    const Tensor<gpu, 2> &src) {
+  dim3 dimBlock(kBaseThreadNum);
+  dim3 dimGrid(dst.shape[1]);
+  utils::Assert(dst.shape == src.shape, "Softmax: shape mismatch");
+  CheckLaunchParam(dimGrid, dimBlock, "Softmax");
+  SoftmaxKernel<kBaseThreadBits><<<dimGrid, dimBlock>>>(dst, src);
+}
+};  // namespace cuda
+};  // namespace mshadow
+#endif  // MSHADOW_CUDA_TENSOR_GPU_INL_CUH_

diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h
index e216daabb733..70e3c8567542 100644
--- a/mshadow/tensor_cpu-inl.h
+++ b/mshadow/tensor_cpu-inl.h
@@ -6,9 +6,9 @@
  */
 #ifndef MSHADOW_TENSOR_CPU_INL_H_
 #define MSHADOW_TENSOR_CPU_INL_H_
-
 #include <cstring>
 #include "./base.h"
+#include "./tensor.h"
 #include "./sse-inl.h"
 
 namespace mshadow {
diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h
new file mode 100644
index 000000000000..99b1a6cd4dea
--- /dev/null
+++ b/mshadow/tensor_gpu-inl.h
@@ -0,0 +1,163 @@
+/*!
+ * Copyright (c) 2014 by Contributors
+ * \file tensor_gpu-inl.h
+ * \brief implementation of GPU host code
+ * \author Bing Xu, Tianqi Chen
+ */
+#ifndef MSHADOW_TENSOR_GPU_INL_H_
+#define MSHADOW_TENSOR_GPU_INL_H_
+#include "./base.h"
+#include "./tensor.h"
+
+namespace mshadow {
+#if !(MSHADOW_USE_CUDA)
+// do nothing if no GPU operation is involved
+inline void InitTensorEngine(int dev_id) {
+}
+inline void ShutdownTensorEngine(void) {
+}
+#else
+#if (MSHADOW_USE_NVML)
+inline int AutoSelectDevice(int device_count) {
+  // TODO(bing): nvml device id and cuda device id are not consistent
+  return 0;
+}
+#endif
+inline void InitTensorEngine(int dev_id) {
+  cudaDeviceProp prop;
+  int device_id = 0;
+  int device_count = 0;
+  cudaGetDeviceCount(&device_count);
+  utils::Check(device_count > 0,
+               "Cannot find CUDA device. Please check CUDA-Configuration");
+  if (dev_id < 0) {
+#if (MSHADOW_USE_NVML)
+    device_id = AutoSelectDevice(device_count);
+#endif
+  } else {
+    device_id = dev_id;
+  }
+  utils::Check(device_id < device_count, "Incorrect Device ID");
+  utils::Check(cudaSetDevice(device_id) == cudaSuccess, "cannot set device");
+  cudaGetDeviceProperties(&prop, device_id);
+  printf("Use CUDA Device %d: %s\n", device_id, prop.name);
+  cublasInit();
+}
+inline void ShutdownTensorEngine(void) {
+  cublasShutdown();
+}
+template<int dim, typename DType>
+inline void AllocSpace(Tensor<gpu, dim, DType> *obj, bool pad) {
+  size_t pitch;
+  // common choice for cuda mem align unit is 32
+  if (pad && obj->size(dim - 1) >= MSHADOW_MIN_PAD_RATIO * 32) {
+    cudaError_t err =
+        cudaMallocPitch(reinterpret_cast<void**>(&(obj->dptr_)), &pitch,
+                        obj->size(dim - 1) * sizeof(DType),
+                        obj->shape_.FlatTo2D()[0]);
+    utils::Check(err == cudaSuccess, cudaGetErrorString(err));
+    obj->stride_ = static_cast<index_t>(pitch / sizeof(DType));
+  } else {
+    obj->stride_ = obj->size(dim - 1);
+    cudaError_t err =
+        cudaMallocPitch(reinterpret_cast<void**>(&(obj->dptr_)), &pitch,
+                        obj->shape_.Size() * sizeof(DType), 1);
+    utils::Check(err == cudaSuccess, cudaGetErrorString(err));
+  }
+}
+template<int dim, typename DType>
+inline void FreeSpace(Tensor<gpu, dim, DType> *obj) {
+  cudaFree(obj->dptr_); obj->dptr_ = NULL;
+}
+template<typename A, typename B, int dim, typename DType>
+inline void Copy(Tensor<A, dim, DType> _dst,
+                 Tensor<B, dim, DType> _src,
+                 cudaMemcpyKind kind) {
+  utils::Check(_dst.shape_ == _src.shape_, "Copy:shape mismatch");
+  Tensor<A, 2, DType> dst = _dst.FlatTo2D();
+  Tensor<B, 2, DType> src = _src.FlatTo2D();
+  cudaError_t err = cudaMemcpy2D(dst.dptr_, dst.stride_ * sizeof(DType),
+                                 src.dptr_, src.stride_ * sizeof(DType),
+                                 dst.size(1) * sizeof(DType),
+                                 dst.size(0), kind);
+  utils::Check(err == cudaSuccess, cudaGetErrorString(err));
+}
+template<int dim, typename DType>
+inline void Copy(Tensor<cpu, dim, DType> dst,
+                 const Tensor<gpu, dim, DType> &src) {
+  Copy(dst, src,
cudaMemcpyDeviceToHost); +} +template +inline void Copy(Tensor dst, + const Tensor &src) { + Copy(dst, src, cudaMemcpyDeviceToDevice); +} +template +inline void Copy(Tensor dst, + const Tensor &src) { + Copy(dst, src, cudaMemcpyHostToDevice); +} +// the following part is included only if compiler is nvcc +#ifdef __CUDACC__ +#include "./cuda/tensor_gpu-inl.cuh" + +template +inline void MapExp(TRValue *dst, + const expr::Exp &exp) { + expr::TypeCheckPass::kMapPass> + ::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); + Shape eshape = expr::ShapeCheck::Check(exp.self()); + utils::Check(eshape[0] == 0 || eshape == dst->self().shape_, + "Assignment: Shape of Tensors are not consistent with target"); + cuda::MapPlan(MakePlan(dst->self()), + MakePlan(exp.self()), + dst->shape_.FlatTo2D()); +} + +template +inline void MapReduceKeepLowest(TRValue *dst, + const expr::Exp &exp, + DType scale) { + expr::TypeCheckPass::kRedPass> + ::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + Shape<2> eshape = expr::ShapeCheck::kDim, E> + ::Check(exp.self()).FlatTo2D(); + utils::Check(eshape[1] == dst->self().size(0), + "MapReduceKeepLowest::reduction dimension do not match"); + utils::Check(eshape[0] != 0, "can not reduce over empty tensor"); + cuda::MapReduceKeepLowest + (MakePlan(dst->self()), MakePlan(exp.self()), scale, eshape); +} + +template +inline void MapReduceKeepHighDim(TRValue *dst, + const expr::Exp &exp, + DType scale) { + expr::TypeCheckPass::kRedPass> + ::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + typedef Shape::kDim> EShape; + EShape eshape = expr::ShapeCheck::kDim, E> + ::Check(exp.self()); + utils::Check(eshape[dimkeep] == dst->self().size(0), + "MapReduceKeepHighDim::reduction dimension do not match"); + // use equvalent form + Shape<4> pshape = Shape4(eshape.ProdShape(0, dimkeep), + eshape[dimkeep], + eshape.ProdShape(dimkeep, EShape::kSubdim), + eshape[EShape::kSubdim]); + // call equavalent map red dim 2 + cuda::MapReduceKeepDim2 + (MakePlan(dst->self()), MakePlan(exp.self()), scale, pshape); +} +template +inline void Softmax(Tensor dst, + const Tensor& src) { + cuda::Softmax(dst, src); +} +#endif // __CUDACC__ +#endif // MSHADOW_USE_CUDA +} // namespace mshadow +#endif // MSHADOW_TENSOR_GPU_INL_H_ diff --git a/mshadow/tensor_gpu-inl.hpp b/mshadow/tensor_gpu-inl.hpp deleted file mode 100644 index a2c1fc4a138f..000000000000 --- a/mshadow/tensor_gpu-inl.hpp +++ /dev/null @@ -1,148 +0,0 @@ -#ifndef MSHADOW_TENSOR_GPU_INL_HPP -#define MSHADOW_TENSOR_GPU_INL_HPP -/*! - * \file tensor_gpu-inl.hpp - * \brief implementation of GPU host code - * \author Bing Xu, Tianqi Chen - */ -#include "tensor.h" - -#if !(MSHADOW_USE_CUDA) -namespace mshadow { - // do nothing if no GPU operation is involved - inline void InitTensorEngine( int dev_id ){ - } - inline void ShutdownTensorEngine( void ){ - } -}; -#else -namespace mshadow { - #if (MSHADOW_USE_NVML) - inline int AutoSelectDevice(int device_count) { - // TODO nvml device id and cuda device id are not consistent - return 0; - } - #endif - inline void InitTensorEngine(int dev_id){ - cudaDeviceProp prop; - int device_id = 0; - int device_count = 0; - cudaGetDeviceCount(&device_count); - utils::Assert(device_count > 0, "Cannot find CUDA device. 
Please check CUDA-Configuration"); - if (dev_id < 0) { - #if (MSHADOW_USE_NVML) - device_id = AutoSelectDevice(device_count); - #endif - } else { - device_id = dev_id; - } - utils::Assert( device_id < device_count, "Incorrect Device ID" ); - utils::Assert( cudaSetDevice(device_id) == cudaSuccess, "cannot set device" ); - cudaGetDeviceProperties(&prop, device_id); - printf("Use CUDA Device %d: %s\n", device_id, prop.name); - cublasInit(); - } - inline void ShutdownTensorEngine( void ){ - cublasShutdown(); - } - - template - inline void AllocSpace(Tensor &obj, bool pad){ - size_t pitch; - // common choice for cuda mem align unit is 32 - if( pad && obj.shape[0] >= MSHADOW_MIN_PAD_RATIO * 32 ){ - cudaError_t err = cudaMallocPitch( (void**)&obj.dptr, &pitch, \ - obj.shape[0] * sizeof(real_t), obj.FlatTo2D().shape[1] ); - utils::Assert( err == cudaSuccess, cudaGetErrorString(err) ); - obj.shape.stride_ = static_cast( pitch / sizeof(real_t) ); - }else{ - obj.shape.stride_ = obj.shape[0]; - cudaError_t err = cudaMallocPitch( (void**)&obj.dptr, &pitch, \ - obj.shape.Size() * sizeof(real_t), 1 ); - utils::Assert( err == cudaSuccess, cudaGetErrorString(err) ); - } - } - - template - inline void FreeSpace(Tensor &obj){ - cudaFree( obj.dptr ); obj.dptr = NULL; - } - - template - inline void Copy(Tensor _dst, Tensor _src, cudaMemcpyKind kind){ - utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" ); - Tensor dst = _dst.FlatTo2D(); - Tensor src = _src.FlatTo2D(); - cudaError_t err = cudaMemcpy2D( dst.dptr, dst.shape.stride_ * sizeof(real_t), - src.dptr, src.shape.stride_ * sizeof(real_t), - dst.shape[0] * sizeof(real_t), - dst.shape[1], kind ); - utils::Assert( err == cudaSuccess, cudaGetErrorString(err) ); - } - template - inline void Copy(Tensor dst, const Tensor &src){ - Copy( dst, src, cudaMemcpyDeviceToHost ); - } - template - inline void Copy(Tensor dst, const Tensor &src){ - Copy( dst, src, cudaMemcpyDeviceToDevice ); - } - template - inline void Copy(Tensor dst, const Tensor &src){ - Copy( dst, src, cudaMemcpyHostToDevice ); - } -}; - -#ifdef __CUDACC__ -// the following part is included only if compiler is nvcc -#include "cuda/tensor_gpu-inl.cuh" - -namespace mshadow{ - template - inline void MapPlan(Tensor _dst, const expr::Plan &plan){ - cuda::MapPlan( _dst.FlatTo2D(), plan ); - } - - template - inline void MapExp(Tensor dst, const expr::Exp &exp ){ - using namespace expr; - TypeCheckPass< TypeCheck::kMapPass >::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); - Shape eshape = ShapeCheck::Check( exp.self() ); - utils::Assert( eshape[0] == 0 || eshape == dst.shape, "Assignment: Shape of Tensors in expression is not consistent with target" ); - MapPlan( dst, MakePlan( exp.self() ) ); - } - - template - inline void MapReduceKeepLowest( Tensor dst, const expr::Exp &exp, real_t scale ){ - using namespace expr; - TypeCheckPass< TypeCheck::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); - Shape<2> eshape = ShapeCheck< ExpInfo::kDim, E >::Check( exp.self() ).FlatTo2D(); - - utils::Assert( eshape[0] == dst.shape[0], "reduction dimension do not match" ); - utils::Assert( eshape[1] != 0, "can not reduce over empty tensor" ); - cuda::MapReduceKeepLowest( dst, MakePlan( exp.self() ), scale, eshape ); - } - - template - inline void MapReduceKeepHighDim( Tensor dst, const expr::Exp &exp, real_t scale ){ - using namespace expr; - TypeCheckPass< TypeCheck::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); - typedef Shape< ExpInfo::kDim > EShape; - EShape eshape = ShapeCheck< ExpInfo::kDim, 
E >::Check( exp.self() ); - utils::Assert( eshape[dimkeep] == dst.shape[0], "reduction dimension do not match" ); - // use equvalent form - Shape<4> pshape = Shape4( eshape.ProdShape(dimkeep+1,EShape::kMaxShape), eshape[dimkeep], - eshape.ProdShape(1,dimkeep), eshape[0] ); - // call equavalent map red dim 2 - cuda::MapReduceKeepDim2( dst, MakePlan( exp.self() ), scale, pshape ); - } - - inline void Softmax( Tensor dst, const Tensor& src ){ - cuda::Softmax( dst, src ); - } -}; // namespace mshadow - -#endif // __CUDACC__ - -#endif // MSHADOW_USE_CUDA -#endif // TENSOR_GPU_INL_HPP From 236644c537d5fa006ab83c1496ae9b09db3b3aeb Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 27 Dec 2014 05:50:37 -0800 Subject: [PATCH 036/147] ok --- mshadow/cuda/tensor_gpu-inl.cuh | 258 +++++++++++++++++--------------- mshadow/tensor_gpu-inl.h | 5 +- 2 files changed, 141 insertions(+), 122 deletions(-) diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index 65d186738cf5..3712316445c7 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -11,10 +11,11 @@ namespace mshadow { namespace cuda { +/*! \brief seems CUDAARCH is deprecated in future NVCC */ #ifndef __CUDA_ARCH__ -#warning "__CUDA_ARCH__ is not defined, I will assume compiling with CUDA verion greater than 2.0" +//#warning "__CUDA_ARCH__ is not defined, I will assume compiling with CUDA verion greater than 2.0" #endif -/* load unit for memory access */ +/* load unit for memory access, if CUDAARCH not defined, this is advanced nvcc */ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200 const int kMemUnitBits = 5; const int kMaxThreadsPerBlock = 1024; @@ -23,7 +24,7 @@ const int kMemUnitBits = 4; const int kMaxThreadsPerBlock = 512; #endif /*! \brief number of units that can do synchronized update, half warp size */ -const int kMemUnit = 1 << kMemUnitBits; +const int kMemUnit = 1 << kMemUnitBits; /*! \brief mask that could be helpful sometime */ const int kMemUnitMask = kMemUnit - 1; /*! \brief suggested thread number(logscale) for mapping kernel */ @@ -31,202 +32,219 @@ const int kBaseThreadBits = 8; /*! \brief suggested thread number for mapping kernel */ const int kBaseThreadNum = 1 << kBaseThreadBits; /*! \brief maximum value of grid */ -const int kMaxGridNum = 65535; +const int kMaxGridNum = 65535; /*! \brief suggested grid number for mapping kernel */ -const int kBaseGridNum = 1024; - +const int kBaseGridNum = 1024; /*! 
\brief get align stride for given size in x dimension */ inline index_t GetAlignStride(index_t xsize, index_t xstride) { - if((xstride & (kMemUnit-1)) == 0) { + if ((xstride & (kMemUnit - 1)) == 0) { return ((xsize + kMemUnit - 1) >> kMemUnitBits) << kMemUnitBits; - }else{ + } else { // if originally space is not aligned, no necessary to to alligned thread allocation return xsize; } } inline void CheckLaunchParam(dim3 dimGrid, dim3 dimBlock, const char *estr = "") { - if(dimBlock.x*dimBlock.y*dimBlock.z > (unsigned)kMaxThreadsPerBlock || - dimGrid.x > 65535 || dimGrid.y > 65535) { + if (dimBlock.x * dimBlock.y * dimBlock.z > static_cast(kMaxThreadsPerBlock) || + dimGrid.x > 65535 || dimGrid.y > 65535) { fprintf(stderr, "%s[%u,%u,%u]:", estr, dimBlock.x, dimBlock.y, dimBlock.z); utils::Error("too large launch parameter\n"); } -} -}; - -namespace cuda { -template -__device__ void MapPlanProc(Tensor dst, const index_t xstride, const Plan exp, int block_idx) { +} +template +__device__ void MapPlanProc(DstPlan dst, index_t xstride, + Shape<2> dshape, const Plan exp, int block_idx) { const index_t tid = (block_idx << block_dim_bits) + threadIdx.x; - const int y = tid / xstride; - const int x = tid % xstride; - if (y < dst.shape[1] && x < dst.shape[0]) { - Saver::Save(dst[y][x], exp.Eval(y,x)); + const int y = tid / xstride; + const int x = tid % xstride; + if (y < dshape[0] && x < dshape[1]) { + Saver::Save(dst.Eval(y, x), exp.Eval(y,x)); } } -template -__global__ void MapPlanKernel(Tensor dst, const index_t xstride, const Plan exp) { - MapPlanProc(dst, xstride, exp, blockIdx.x); +template +__global__ void MapPlanKernel(DstPlan dst, index_t xstride, + Shape<2> dshape, const Plan exp) { + MapPlanProc + (dst, xstride, dshape, exp, blockIdx.x); } -template -__global__ void MapPlanLargeKernel(Tensor dst, const index_t xstride, const Plan exp, int repeat) { - for(int i = 0; i < repeat; ++i) { - MapPlanProc(dst, xstride, exp, blockIdx.x + i*grid_size); +template +__global__ void MapPlanLargeKernel(DstPlan dst, index_t xstride, + Shape<2> dshape, const Plan exp, int repeat) { + for (int i = 0; i < repeat; ++i) { + MapPlanProc + (dst, xstride, dshape, exp, blockIdx.x + i * grid_size); } -} +} -template -inline void MapPlan(Tensor dst, const expr::Plan &plan) { - const index_t xstride = GetAlignStride(dst.shape[0], dst.shape.stride_); - const int num_block = (dst.shape[1]*xstride + kBaseThreadNum-1) / kBaseThreadNum; +template +inline void MapPlan(expr::Plan dst, + const expr::Plan &plan, + Shape<2> dshape, index_t dstride) { + const index_t xstride = GetAlignStride(dshape[1], dstride); + const int num_block = (dshape[0] * xstride + kBaseThreadNum-1) / kBaseThreadNum; dim3 dimBlock(kBaseThreadNum, 1, 1); if (num_block < kMaxGridNum) { dim3 dimGrid(num_block, 1, 1); - MapPlanKernel, kBaseThreadBits> \ - <<>>(dst, xstride, plan); + MapPlanKernel, + expr::Plan > + <<>>(dst, xstride, plan, dshape); } else { int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum; dim3 dimGrid(kBaseGridNum, 1 , 1); - MapPlanLargeKernel, kBaseThreadBits, kBaseGridNum> \ - <<>>(dst, xstride, plan, repeat); + MapPlanLargeKernel, + expr::Plan > + <<>>(dst, xstride, plan, dshape, repeat); } -} -}; // namespace cuda +} -namespace cuda{ -template -__global__ void MapRedKeepLowestKernel(Tensor dst, Plan plan, real_t scale, Shape<2> eshape) { +template +__global__ void MapRedKeepLowestKernel(DstPlan dst, Plan plan, + DType scale, Shape<2> eshape) { const unsigned warp_size = 1 << warp_bits; - const unsigned x = (blockIdx.x<= 
warp_size) { + Reduce1D(s_res[threadIdx.y]); + } else { + Reduce1DNotAlign(s_res[threadIdx.y], eshape[0]); + } __syncthreads(); - if(eshape[1] >= warp_size) { - Reduce1D(s_res[threadIdx.y]); - }else{ - Reduce1DNotAlign(s_res[threadIdx.y], eshape[1]); + + if (threadIdx.y == 0 && x < eshape[1]) { + Saver::Save(dst.Eval(0, x), s_res[threadIdx.x][0] * scale); } - __syncthreads(); - - if(threadIdx.y == 0 && x < eshape[0]) { - Saver::Save(dst[x], s_res[threadIdx.x][0] * scale); - } -} +} -template -inline void MapReduceKeepLowest(Tensor dst, const expr::Plan &plan, real_t scale, Shape<2> eshape) { +template +inline void MapReduceKeepLowest(expr::Plan dst, + const expr::Plan &plan, + DType scale, Shape<2> eshape) { dim3 dimBlock(kMemUnit, kMemUnit); - dim3 dimGrid ((eshape[0]+kMemUnit-1) >> kMemUnitBits); + dim3 dimGrid((eshape[1] + kMemUnit - 1) >> kMemUnitBits); CheckLaunchParam(dimGrid, dimBlock, "MapRedKeepLowestKernel"); - MapRedKeepLowestKernel<<>>(dst, plan, scale, eshape); -} -}; // namespace cuda + MapRedKeepLowestKernel, + expr::Plan > + <<>>(dst, plan, scale, eshape); +} -namespace cuda{ -template -__global__ void MapReduceKeepDim2Kernel(Tensor dst, Plan plan, real_t scale, Shape<4> pshape) { +template +__global__ void MapReduceKeepDim1Kernel(DstPlan dst, Plan plan, DType scale, Shape<4> pshape) { const int block_size = 1 << block_dim_bits; - __shared__ real_t s_rec[block_size]; - const int c = blockIdx.x; - const index_t tot = pshape[0]*pshape[1]*pshape[3]; + __shared__ DType s_rec[block_size]; + const int c = blockIdx.x; + const index_t tot = pshape[3] * pshape[2] * pshape[0]; - real_t res = Reducer::kInitV; - for(index_t i_offset = 0; i_offset < tot; i_offset += block_size) { + DType res; Reducer::SetInitValue(res); + for (index_t i_offset = 0; i_offset < tot; i_offset += block_size) { index_t i = i_offset + threadIdx.x; - if(i< tot) { - const index_t x = i % pshape[0]; - i /= pshape[0]; - const index_t y = i % pshape[1]; - const index_t n = i / pshape[1]; - Reducer::Reduce(res, plan.Eval((n*pshape[2] + c) * pshape[1] + y, x)); + if (i< tot) { + const index_t x = i % pshape[3]; + i /= pshape[3]; + const index_t y = i % pshape[2]; + const index_t n = i / pshape[2]; + Reducer::Reduce(res, plan.Eval((n * pshape[1] + c) * pshape[2] + y, x)); } - } + } s_rec[threadIdx.x] = res; __syncthreads(); - Reduce1D(s_rec); - if(threadIdx.x == 0) { - Saver::Save(dst[c], s_rec[0]*scale); + Reduce1D(s_rec); + if (threadIdx.x == 0) { + Saver::Save(dst.Eval(0, c), s_rec[0] * scale); } } -template -inline void MapReduceKeepDim2(Tensor dst, const Plan &plan, real_t scale, Shape<4> pshape) { +template +inline void MapReduceKeepDim1(expr::Plan dst, + const expr::Plan &plan, + DType scale, Shape<4> pshape) { dim3 dimBlock(kBaseThreadNum); - dim3 dimGrid (dst.shape[0]); - CheckLaunchParam(dimGrid, dimBlock, "MapReduceKeepDim2"); - MapReduceKeepDim2Kernel - <<>>(dst, plan, scale, pshape); + dim3 dimGrid (pshape[1]); + CheckLaunchParam(dimGrid, dimBlock, "MapReduceKeepDim1"); + MapReduceKeepDim1Kernel, + expr::Plan > + <<>>(dst, plan, scale, pshape); } -}; -namespace cuda{ -template -__global__ void SoftmaxKernel(Tensor dst, Tensor src) { - const unsigned x_size = 1 << x_bits; +template +__global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) { + const unsigned x_size = 1 << x_bits; const int y = blockIdx.x; - __shared__ real_t s_rec[x_size]; - + __shared__ DType s_rec[x_size]; // step 1: get max - if(threadIdx.x < dst.shape[0]) { - s_rec[threadIdx.x] = src[y][threadIdx.x] ; + if (threadIdx.x < 
xmax) {
+    s_rec[threadIdx.x] = src.Eval(y, threadIdx.x);
  }
-  for(unsigned x = x_size; x < dst.shape[0]; x += x_size) {
-    if(x + threadIdx.x < dst.shape[0]) {
-      real_t a = src[y][x + threadIdx.x];
+  for (unsigned x = x_size; x < xmax; x += x_size) {
+    if (x + threadIdx.x < xmax) {
+      DType a = src[y][x + threadIdx.x];
      s_rec[threadIdx.x] = max(a, s_rec[threadIdx.x]);
    }
  }
  __syncthreads();
-  if(threadIdx.x >= dst.shape[0]) {
+  if (threadIdx.x >= xmax) {
    s_rec[threadIdx.x] = s_rec[0];
  }
  __syncthreads();
-  Reduce1D(s_rec);
+  Reduce1D(s_rec);
  __syncthreads();
-  real_t smax = s_rec[0];
+  DType smax = s_rec[0];
  __syncthreads();
  s_rec[threadIdx.x] = 0.0f;
  __syncthreads();
-
  // calculate normalizer, with writeback
-  for(unsigned x = 0; x < dst.shape[0]; x += x_size) {
-    if(x + threadIdx.x < dst.shape[0]) {
-      real_t p = expf(src[y][x + threadIdx.x] - smax);
+  for (unsigned x = 0; x < xmax; x += x_size) {
+    if (x + threadIdx.x < xmax) {
+      real_t p = expf(src.Eval(y, x + threadIdx.x) - smax);
      s_rec[threadIdx.x] += p;
      // write back first, will fetch later
-      dst[y][x + threadIdx.x] = p;
+      dst.Eval(y, x + threadIdx.x) = p;
    }
  }
  // calculate normalizer
  __syncthreads();
-  Reduce1D(s_rec);
+  Reduce1D(s_rec);
  __syncthreads();
-  real_t ssum = s_rec[0];
+  DType ssum = s_rec[0];

-  for(unsigned x = 0; x < dst.shape[0]; x += x_size) {
-    if(x + threadIdx.x < dst.shape[0]) {
-      dst[y][x + threadIdx.x] /= ssum;
+  for (unsigned x = 0; x < xmax; x += x_size) {
+    if (x + threadIdx.x < xmax) {
+      dst.Eval(y, x + threadIdx.x) /= ssum;
    }
  }
}
-
-inline void Softmax(Tensor &dst, const Tensor &src) {
+template
+inline void Softmax(Tensor &dst,
+                    const Tensor &src) {
  dim3 dimBlock(kBaseThreadNum);
-  dim3 dimGrid (dst.shape[1]);
-  utils::Assert(dst.shape == src.shape, "Softmax: shape mismatch");
+  dim3 dimGrid(dst.shape[0]);
+  utils::Check(dst.shape_ == src.shape_, "Softmax: shape mismatch");
  CheckLaunchParam(dimGrid, dimBlock, "Softmax");
-  SoftmaxKernel<<>>(dst, src);
+  SoftmaxKernel
+      <<>>(expr::MakePlan(dst), expr::MakePlan(src));
}
-}; // namespace cuda
-}; // namespace mshadow
+}  // namespace cuda
+}  // namespace mshadow
#endif  // MSHADOW_CUDA_TENSOR_GPU_INL_CUH_
diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h
index 99b1a6cd4dea..e3803161f5a2 100644
--- a/mshadow/tensor_gpu-inl.h
+++ b/mshadow/tensor_gpu-inl.h
@@ -112,7 +112,8 @@ inline void MapExp(TRValue *dst,
         "Assignment: Shape of Tensors are not consistent with target");
  cuda::MapPlan(MakePlan(dst->self()),
                MakePlan(exp.self()),
-               dst->shape_.FlatTo2D());
+               dst->shape_.FlatTo2D(),
+               dst->stride_);
}
template *dst,
                eshape.ProdShape(dimkeep, EShape::kSubdim),
                eshape[EShape::kSubdim]);
  // call equivalent map red dim 2
-  cuda::MapReduceKeepDim2
+  cuda::MapReduceKeepDim1
      (MakePlan(dst->self()), MakePlan(exp.self()), scale, pshape);
}
template
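A note on the SoftmaxKernel above before the next patch: the kernel first reduces the per-row maximum, subtracts it inside expf so the exponentials cannot overflow, writes the unnormalized values back into dst, and then divides by the reduced sum. The same three steps on the host look roughly like this (a minimal plain-C++ reference sketch; SoftmaxRow is an illustrative name, not part of mshadow):

#include <algorithm>
#include <cmath>
// reference softmax over one row of length n, using the same
// max-subtraction trick for numerical stability as the kernel
inline void SoftmaxRow(float *row, int n) {
  float smax = row[0];  // step 1: find the row maximum
  for (int i = 1; i < n; ++i) smax = std::max(smax, row[i]);
  float ssum = 0.0f;    // step 2: exponentiate and accumulate the normalizer
  for (int i = 0; i < n; ++i) {
    row[i] = std::exp(row[i] - smax);
    ssum += row[i];
  }
  for (int i = 0; i < n; ++i) row[i] /= ssum;  // step 3: normalize
}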
From fa55b5862d4fe8accf5af0625ac38ed0e191ebbf Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sat, 27 Dec 2014 20:25:03 -0800
Subject: [PATCH 037/147] make cuda part compile

---
 mshadow/base.h                  |  6 +++---
 mshadow/cuda/reduce.cuh         | 22 ++++++++++++----------
 mshadow/cuda/tensor_gpu-inl.cuh | 28 ++++++++++++++--------------
 mshadow/random.h                | 14 +++++++-------
 mshadow/tensor.h                |  1 +
 mshadow/tensor_gpu-inl.h        | 20 +++++++++++---------
 6 files changed, 48 insertions(+), 43 deletions(-)

diff --git a/mshadow/base.h b/mshadow/base.h
index 640086883bf3..e5be86aa4bf3 100644
--- a/mshadow/base.h
+++ b/mshadow/base.h
@@ -37,9 +37,9 @@
 #endif
 #if MSHADOW_STAND_ALONE
-  #define MSHADOW_USE_CBLAS 1
-  #define MSHADOW_USE_MKL 0
-  #define MSHADOW_USE_CUDA 0
+  #define MSHADOW_USE_CBLAS 0
+  #define MSHADOW_USE_MKL 1
+  #define MSHADOW_USE_CUDA 1
 #endif
 /*! \brief use CBLAS for CBLAS */
diff --git a/mshadow/cuda/reduce.cuh b/mshadow/cuda/reduce.cuh
index 05cf4d79a292..3baaf7a57ed6 100644
--- a/mshadow/cuda/reduce.cuh
+++ b/mshadow/cuda/reduce.cuh
@@ -13,18 +13,20 @@ namespace cuda {
 * \brief reduce over the dimension x
 * \tparam Reducer reducer
 * \tparam x_bits dimension = 1<
-inline __device__ void Reduce1D(volatile real_t buf[1 << x_bits]);
+template
+inline __device__ void Reduce1D(volatile DType buf[1 << x_bits]);
 /*
 * \brief reduce over the dimension x
 * \tparam Reducer reducer
 * \tparam xmax_bits maximum size of buffer
+ * \tparam DType content data type
 * \param xsize size of x dimension, not sure if aligned
 */
-template
+template
 inline __device__ void
-Reduce1DNotAlign(volatile real_t buf[1 << xmax_bits], int xsize);
+Reduce1DNotAlign(volatile DType buf[1 << xmax_bits], int xsize);
 // ===============================================x===
 // implementations afterwards,
 // no need to read if only use the functions
@@ -35,8 +37,8 @@ Reduce1DNotAlign(volatile real_t buf[1 << xmax_bits], int xsize);
 #define __MSHADOW_EMUSYNC__
 #endif
-template
-inline __device__ void ReduceX(volatile real_t buf[], int tid) {
+template
+inline __device__ void ReduceX(volatile DType buf[], int tid) {
  if (x_bits >= 10) {
    if (tid < 512) Reducer::Reduce(buf[tid] , buf[tid + 512]);
    __syncthreads();
  }
@@ -83,8 +85,8 @@ inline __device__ void ReduceX(volatile real_t buf[], int tid) {
    __MSHADOW_EMUSYNC__;
  }
}
-template
-inline __device__ void Reduce1D(volatile real_t buf[1 << x_bits]) {
+template
+inline __device__ void Reduce1D(volatile DType buf[1 << x_bits]) {
  ReduceX(buf, threadIdx.x);
}
// reduce with an upper bound
@@ -98,8 +100,8 @@ inline __device__ void Reduce1D(volatile real_t buf[1 << x_bits]) {
    ReduceX(buf, tid); \
  } \
-template
-inline __device__ void Reduce1DNotAlign(volatile real_t buf[], int x_size) {
+template
+inline __device__ void Reduce1DNotAlign(volatile DType buf[], int x_size) {
  int tid = threadIdx.x;
  __RD_NON_ALIGN(, 8)
  __RD_NON_ALIGN(else, 7)
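The unrolled ReduceX above is the classic shared-memory tree reduction, written out per power of two so the compiler can drop dead branches and skip syncs within a warp. For reference, the loop form of the same pattern (a sketch, assuming blockDim.x is a power of two and that Reducer follows the mshadow convention of a static Reduce(dst, src) shown in reduce.cuh):

template<typename Reducer, typename DType>
__device__ void TreeReduce(volatile DType *buf, int tid) {
  // halve the active range each step; buf[0] holds the result at the end
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) Reducer::Reduce(buf[tid], buf[tid + s]);
    __syncthreads();
  }
}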
diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh
index 3712316445c7..5db82d88b701 100644
--- a/mshadow/cuda/tensor_gpu-inl.cuh
+++ b/mshadow/cuda/tensor_gpu-inl.cuh
@@ -7,7 +7,7 @@
 #ifndef MSHADOW_CUDA_TENSOR_GPU_INL_CUH_
 #define MSHADOW_CUDA_TENSOR_GPU_INL_CUH_
 #include "../tensor.h"
-#include "./cuda_reduce.cuh"
+#include "./reduce.cuh"
 namespace mshadow {
 namespace cuda {
@@ -79,8 +79,8 @@ __global__ void MapPlanLargeKernel(DstPlan dst, index_t xstride,
  }
}
-template
-inline void MapPlan(expr::Plan dst,
+template
+inline void MapPlan(expr::Plan dst,
                    const expr::Plan &plan,
                    Shape<2> dshape, index_t dstride) {
  const index_t xstride = GetAlignStride(dshape[1], dstride);
@@ -90,16 +90,16 @@ inline void MapPlan(expr::Plan dst,
  if (num_block < kMaxGridNum) {
    dim3 dimGrid(num_block, 1, 1);
    MapPlanKernel,
+                  expr::Plan, expr::Plan >
-        <<>>(dst, xstride, plan, dshape);
+        <<>>(dst, xstride, dshape, plan);
  } else {
    int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum;
    dim3 dimGrid(kBaseGridNum, 1 , 1);
    MapPlanLargeKernel,
+                       expr::Plan, expr::Plan >
-        <<>>(dst, xstride, plan, dshape, repeat);
+        <<>>(dst, xstride, dshape, plan, repeat);
  }
}
@@ -134,15 +134,15 @@ __global__ void MapRedKeepLowestKernel(DstPlan dst, Plan plan,
  }
 template
-inline void MapReduceKeepLowest(expr::Plan dst,
+          typename DstExp, typename E, typename DType>
+inline void MapReduceKeepLowest(expr::Plan dst,
                                const expr::Plan &plan,
                                DType scale, Shape<2> eshape) {
  dim3 dimBlock(kMemUnit, kMemUnit);
  dim3 dimGrid((eshape[1] + kMemUnit - 1) >> kMemUnitBits);
  CheckLaunchParam(dimGrid, dimBlock, "MapRedKeepLowestKernel");
  MapRedKeepLowestKernel,
+                         expr::Plan, expr::Plan >
      <<>>(dst, plan, scale, eshape);
}
@@ -174,15 +174,15 @@ __global__ void MapReduceKeepDim1Kernel(DstPlan dst, Plan plan, DType scale, Sha
  }
}
-template
-inline void MapReduceKeepDim1(expr::Plan dst,
+template
+inline void MapReduceKeepDim1(expr::Plan dst,
                              const expr::Plan &plan,
                              DType scale, Shape<4> pshape) {
  dim3 dimBlock(kBaseThreadNum);
  dim3 dimGrid (pshape[1]);
  CheckLaunchParam(dimGrid, dimBlock, "MapReduceKeepDim1");
  MapReduceKeepDim1Kernel,
+                          expr::Plan, expr::Plan >
      <<>>(dst, plan, scale, pshape);
}
@@ -217,7 +217,7 @@ __global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) {
  // calculate normalizer, with writeback
  for (unsigned x = 0; x < xmax; x += x_size) {
    if (x + threadIdx.x < xmax) {
-      real_t p = expf(src.Eval(y, x + threadIdx.x) - smax);
+      DType p = expf(src.Eval(y, x + threadIdx.x) - smax);
      s_rec[threadIdx.x] += p;
      // write back first, will fetch later
      dst.Eval(y, x + threadIdx.x) = p;
diff --git a/mshadow/random.h b/mshadow/random.h
index c9875ded36d2..432a5d107b3d 100644
--- a/mshadow/random.h
+++ b/mshadow/random.h
@@ -214,7 +214,7 @@ class Random {
   * \brief constructor of random engine
   * \param seed random number seed
   */
-  Random(int seed) {
+  Random(int seed) {
    curandStatus_t status;
    status = curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT);
    utils::Check(status == CURAND_STATUS_SUCCESS,
@@ -223,7 +223,7 @@ class Random {
    buffer_.Resize(Shape1(kRandBufferSize));
  }
-  ~Random(void) {
+  ~Random(void) {
    curandStatus_t status;
    status = curandDestroyGenerator(gen_);
    utils::Check(status == CURAND_STATUS_SUCCESS,
@@ -250,9 +250,9 @@ class Random {
  inline void SampleUniform(Tensor *dst,
                            DType a = 0.0f, DType b = 1.0f) {
    if (a == 0.0f && b == 1.0f) {
-      *dst = this->uniform(dst.shape);
+      *dst = this->uniform(dst->shape_);
    } else {
-      *dst = this->uniform(dst.shape) * (b - a) + a;
+      *dst = this->uniform(dst->shape_) * (b - a) + a;
    }
  }
  /*!
@@ -265,7 +265,7 @@ class Random {
  template
  inline void SampleGaussian(Tensor *dst,
                             DType mu = 0.0f, DType sigma = 1.0f) {
-    *dst = this->gaussian(dst.shape, mu, sigma);
+    *dst = this->gaussian(dst->shape_, mu, sigma);
  }
  /*!
   * \brief return a temporal expression storing standard gaussian random variables
@@ -286,7 +286,7 @@ class Random {
    // allocate aligned size
    buffer_.Resize(Shape1(aligned_sz));
    buffer_.Resize(Shape1(shape.Size()));
-    this->GenGaussian(buffer.dptr_, aligned_sz, mu, sigma);
+    this->GenGaussian(buffer_.dptr_, aligned_sz, mu, sigma);
    return expr::reshape(buffer_, shape);
  }
  /*!
@@ -303,7 +303,7 @@ class Random { inline expr::ReshapeExp, DType, dim, 1> uniform(Shape shape) { buffer_.Resize(Shape1(shape.Size())); - this->GenUniform(buffer.dptr_, buffer_.size(0)); + this->GenUniform(buffer_.dptr_, buffer_.size(0)); return expr::reshape(buffer_, shape); } diff --git a/mshadow/tensor.h b/mshadow/tensor.h index 0d460a8442cf..8f5fb185e2ad 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -551,6 +551,7 @@ inline void MapReduceKeepHighDim(TRValue *dst, #include "./expr_engine-inl.h" #include "./extension.h" #include "./tensor_cpu-inl.h" +#include "./tensor_gpu-inl.h" #include "./io.h" #include "./tensor_container.h" #include "./random.h" diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h index e3803161f5a2..f79fe82d2863 100644 --- a/mshadow/tensor_gpu-inl.h +++ b/mshadow/tensor_gpu-inl.h @@ -50,9 +50,9 @@ template inline void AllocSpace(Tensor *obj, bool pad) { size_t pitch; // common choice for cuda mem align unit is 32 - if (pad && obj.size(dim - 1) >= MSHADOW_MIN_PAD_RATIO * 32) { + if (pad && obj->size(dim - 1) >= MSHADOW_MIN_PAD_RATIO * 32) { cudaError_t err = - cudaMallocPitch(reinterpret_cast(&obj.dptr_), &pitch, + cudaMallocPitch(reinterpret_cast(&(obj->dptr_)), &pitch, obj->size(dim - 1) * sizeof(DType), obj->shape_.FlatTo2D()[0]); utils::Check(err == cudaSuccess, cudaGetErrorString(err)); @@ -60,7 +60,7 @@ inline void AllocSpace(Tensor *obj, bool pad) { } else { obj->stride_ = obj->size(dim - 1); cudaError_t err = - cudaMallocPitch(reinterpret_cast(&obj.dptr_), &pitch, + cudaMallocPitch(reinterpret_cast(&(obj->dptr_)), &pitch, obj->shape_.Size() * sizeof(DType), 1); utils::Check(err == cudaSuccess, cudaGetErrorString(err)); } @@ -74,8 +74,8 @@ inline void Copy(Tensor _dst, Tensor _src, cudaMemcpyKind kind) { utils::Check(_dst.shape_ == _src.shape_, "Copy:shape mismatch"); - Tensor dst = _dst.FlatTo2D(); - Tensor src = _src.FlatTo2D(); + Tensor dst = _dst.FlatTo2D(); + Tensor src = _src.FlatTo2D(); cudaError_t err = cudaMemcpy2D(dst.dptr_, dst.stride_ * sizeof(DType), src.dptr_, src.stride_ * sizeof(DType), dst.size(1) * sizeof(DType), @@ -97,10 +97,12 @@ inline void Copy(Tensor dst, const Tensor &src) { Copy(dst, src, cudaMemcpyHostToDevice); } +} // namespace mshadow // the following part is included only if compiler is nvcc #ifdef __CUDACC__ #include "./cuda/tensor_gpu-inl.cuh" +namespace mshadow { template inline void MapExp(TRValue *dst, @@ -112,13 +114,13 @@ inline void MapExp(TRValue *dst, "Assignment: Shape of Tensors are not consistent with target"); cuda::MapPlan(MakePlan(dst->self()), MakePlan(exp.self()), - dst->shape_.FlatTo2D(), - dst->stride_); + dst->self().shape_.FlatTo2D(), + dst->self().stride_); } template -inline void MapReduceKeepLowest(TRValue *dst, +inline void MapReduceKeepLowest(TRValue *dst, const expr::Exp &exp, DType scale) { expr::TypeCheckPass::kRedPass> @@ -134,7 +136,7 @@ inline void MapReduceKeepLowest(TRValue *dst, template -inline void MapReduceKeepHighDim(TRValue *dst, +inline void MapReduceKeepHighDim(TRValue *dst, const expr::Exp &exp, DType scale) { expr::TypeCheckPass::kRedPass> From c45fc2c7cb0cd406aa345e8fd00d5ec94ee0a951 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 27 Dec 2014 21:07:00 -0800 Subject: [PATCH 038/147] fix compile --- example/Makefile | 5 +- example/neuralnet/convnet.cu | 445 +++++++++++++++-------------- example/neuralnet/nnet.cu | 304 ++++++++++---------- example/neuralnet/util.h | 124 ++++---- mshadow/base.h | 2 +- mshadow/cuda/tensor_gpu-inl.cuh | 8 +- 
 mshadow/extension/channel_pool.h   |   2 +-
 mshadow/extension/pack_col2patch.h |   2 +-
 mshadow/extension/spatial_pool.h   |   2 +-
 mshadow/extension/spatial_unpool.h |   2 +-
 mshadow/tensor.h                   |   2 +-
 mshadow/tensor_container.h         |   2 +-
 mshadow/tensor_cpu-inl.h           |   6 +-
 mshadow/tensor_gpu-inl.h           |   2 +-
 14 files changed, 459 insertions(+), 449 deletions(-)

diff --git a/example/Makefile b/example/Makefile
index 836b192f90d3..8dc0aee554a9 100644
--- a/example/Makefile
+++ b/example/Makefile
@@ -1,6 +1,6 @@
 # set LD_LIBRARY_PATH
 export CC  = gcc
-export CXX = clang++
+export CXX = g++
 export NVCC =nvcc
 export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../
 export LDFLAGS= -lm
@@ -10,13 +10,14 @@ export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX)
 BIN = basic defop
 OBJ =
 CUOBJ =
-CUBIN =
+CUBIN = basicx
 .PHONY: clean all

 all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ)

 basic: basic.cpp
 defop: defop.cpp
+basicx: basic.cu

 $(BIN) :
 	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS)
diff --git a/example/neuralnet/convnet.cu b/example/neuralnet/convnet.cu
index 448810e126f4..de1bd49e18c5 100644
--- a/example/neuralnet/convnet.cu
+++ b/example/neuralnet/convnet.cu
@@ -11,24 +11,24 @@ using namespace mshadow::expr;

 // define operations
 struct relu{
-    MSHADOW_XINLINE static real_t Map(real_t a) {
-        using namespace std;
-        return max( a, 0.0f );
-    }
+  MSHADOW_XINLINE static real_t Map(real_t a) {
+    using namespace std;
+    return max(a, 0.0f);
+  }
};
struct relu_grad {
-    MSHADOW_XINLINE static real_t Map(real_t a) {
-        return a > 0.0f ? 1.0f : 0.0f;
-    }
+  MSHADOW_XINLINE static real_t Map(real_t a) {
+    return a > 0.0f ? 1.0f : 0.0f;
+  }
};

/*! \brief interface for nnet; the interface allows the user to use GPU/CPU implementations in a unified way */
class INNet{
-public:
-    virtual void Forward( const Tensor& inbatch, Tensor &oubatch ) = 0;
-    virtual void Backprop( const Tensor& gradout ) = 0;
-    virtual void Update( void ) = 0;
-    virtual ~INNet(){}
+ public:
+  virtual void Forward(const Tensor& inbatch, Tensor &oubatch) = 0;
+  virtual void Backprop(const Tensor& gradout) = 0;
+  virtual void Update(void) = 0;
+  virtual ~INNet() {}
};

/*!
@@ -36,222 +36,227 @@ public: * this implementation is device invariant */ template -class ConvNet : public INNet{ -public: - // initialize the network - ConvNet( int batch_size, int insize, int nchannel, int ksize, int kstride, int psize, int num_out ) - :rnd(0), ksize(ksize), kstride(kstride), psize(psize){ - // setup nodes - ninput.Resize( Shape4( batch_size, 1, insize, insize ) ); - nhidden.Resize( Shape4( batch_size, nchannel, (insize - ksize)/kstride+1, (insize -ksize)/kstride+1) ); - nhiddenbak.Resize( nhidden.shape ); - npool.Resize( Shape4( batch_size, nchannel, (nhidden.shape[1]+1-psize)/psize, (nhidden.shape[0]+1-psize)/psize ) ); - npoolbak.Resize( npool.shape ); - nflat.Resize( Shape2( batch_size, npool.shape[2]*npool.shape[1]*npool.shape[0] ) ); - nout.Resize( Shape2( batch_size, num_out ) ); - // setup bias - hbias.Resize( Shape1( nchannel ) ); g_hbias.Resize( hbias.shape ); - obias.Resize( Shape1( num_out ) ); g_obias.Resize( obias.shape ); - hbias = 0.0f; obias = 0.0f; - // setup weights - Ki2h.Resize( Shape2( nchannel, ksize*ksize ) ); g_Ki2h.Resize( Ki2h.shape ); - Wh2o.Resize( Shape2( nflat.shape[0], num_out ) ); g_Wh2o.Resize( Wh2o.shape ); - rnd.SampleGaussian( Ki2h, 0, 0.01f ); - rnd.SampleGaussian( Wh2o, 0, 0.01f ); - - printf("conv=%d, pool=%d\n", nhidden.shape[0], npool.shape[0] ); - } - virtual ~ConvNet(){} - // forward propagation - virtual void Forward( const Tensor& inbatch, Tensor &oubatch ){ - index_t batch_size = inbatch.shape[3]; +class ConvNet : public INNet { + public: + // initialize the network + ConvNet(int batch_size, int insize, int nchannel, int ksize, int kstride, int psize, int num_out) + :rnd(0), ksize(ksize), kstride(kstride), psize(psize) { + // setup nodes + ninput.Resize(Shape4(batch_size, 1, insize, insize)); + nhidden.Resize(Shape4(batch_size, nchannel, (insize - ksize)/kstride+1, (insize -ksize)/kstride+1)); + nhiddenbak.Resize(nhidden.shape_); + npool.Resize(Shape4(batch_size, nchannel, (nhidden.size(2)+1-psize)/psize, (nhidden.size(3)+1-psize)/psize)); + npoolbak.Resize(npool.shape_); + nflat.Resize(Shape2(batch_size, npool.size(0)*npool.size(1)*npool.size(2))); + nout.Resize(Shape2(batch_size, num_out)); + // setup bias + hbias.Resize(Shape1(nchannel)); g_hbias.Resize(hbias.shape_); + obias.Resize(Shape1(num_out)); g_obias.Resize(obias.shape_); + hbias = 0.0f; obias = 0.0f; + // setup weights + Ki2h.Resize(Shape2(nchannel, ksize*ksize)); g_Ki2h.Resize(Ki2h.shape_); + Wh2o.Resize(Shape2(nflat.size(1), num_out)); g_Wh2o.Resize(Wh2o.shape_); + rnd.SampleGaussian(&Ki2h, 0, 0.01f); + rnd.SampleGaussian(&Wh2o, 0, 0.01f); + + printf("conv=%d, pool=%d\n", nhidden.size(3), npool.size(3)); + } + virtual ~ConvNet() {} + // forward propagation + virtual void Forward(const Tensor& inbatch, Tensor &oubatch) { + index_t batch_size = inbatch.size(0); // copy data to input layer - Copy( ninput, inbatch ); - // first layer, conv, use stride=2 - ConvForward( ninput, Ki2h, nhidden, ksize, kstride, tmp_col, tmp_dst ); - // add bias - nhidden += broadcast<2>( hbias, nhidden.shape ); - // activation, relu, backup activation in nhidden - nhidden = F( nhidden ); - Copy( nhiddenbak, nhidden ); - // max pooling - npool = pool( nhiddenbak, npool[0][0].shape, psize, psize ); - Copy( npoolbak, npool ); - // flat - nflat = reshape( npool, nflat.shape ); - // second layer fullc - nout = dot( nflat, Wh2o ); - nout += repmat( obias, batch_size ); - // softmax calculation - Softmax( nout, nout ); - // copy result out - Copy( oubatch, nout ); - } - // back propagation - 
virtual void Backprop( const Tensor& gradout ){
-        // copy gradient to output layer
-        Copy( nout, gradout );
-        // calc grad of final layer
-        g_obias = sum_rows( nout );
-        g_Wh2o  = dot( nflat.T(), nout );
-        // backprop to previous layer
-        nflat = dot( nout, Wh2o.T() );
-        npool = reshape( nflat, npool.shape );
-        // backprop pooling layer
-        nhiddenbak = unpool( nhiddenbak, npoolbak, npool, psize, psize );
-        // calculate gradient of relu layer
-        nhidden = F<relu_grad>( nhidden ) * nhiddenbak;
-        // calc grad of layer 1
-        g_hbias = sumall_except_dim<2>( nhidden );
-        ConvBackWard( nhidden, Ki2h, g_Ki2h, ninput, ksize, kstride, tmp_col, tmp_dst );
-    }
+    Copy(ninput, inbatch);
+    // first layer, conv, use stride=2
+    ConvForward(ninput, Ki2h, nhidden, ksize, kstride, tmp_col, tmp_dst);
+    // add bias
+    nhidden += broadcast<1>(hbias, nhidden.shape_);
+    // activation, relu, backup activation in nhidden
+    nhidden = F<relu>(nhidden);
+    Copy(nhiddenbak, nhidden);
+    // max pooling
+    npool = pool(nhiddenbak, npool[0][0].shape_, psize, psize, psize);
+    Copy(npoolbak, npool);
+    // flat
+    nflat = reshape(npool, nflat.shape_);
+    // second layer fullc
+    nout = dot(nflat, Wh2o);
+    nout += repmat(obias, batch_size);
+    // softmax calculation
+    Softmax(nout, nout);
+    // copy result out
+    Copy(oubatch, nout);
+  }
+  // back propagation
+  virtual void Backprop(const Tensor& gradout) {
+    // copy gradient to output layer
+    Copy(nout, gradout);
+    // calc grad of final layer
+    g_obias = sum_rows(nout);
+    g_Wh2o  = dot(nflat.T(), nout);
+    // backprop to previous layer
+    nflat = dot(nout, Wh2o.T());
+    npool = reshape(nflat, npool.shape_);
+    // backprop pooling layer
+    nhiddenbak = unpool(nhiddenbak, npoolbak, npool, psize, psize, psize);
+    // calculate gradient of relu layer
+    nhidden = F<relu_grad>(nhidden) * nhiddenbak;
+    // calc grad of layer 1
+    g_hbias = sumall_except_dim<1>(nhidden);
+    ConvBackWard(nhidden, Ki2h, g_Ki2h, ninput, ksize, kstride, tmp_col, tmp_dst);
+  }
+  // update weight
+  virtual void Update(void) {
+    // run SGD
+    const float eta = 0.1;
+    const float wd = 0.00001;
    // update weight
-    virtual void Update( void ){
-        // run SGD
-        const float eta = 0.1;
-        const float wd = 0.00001;
-        // update weight
-        Ki2h -= eta * ( wd * Ki2h + g_Ki2h );
-        Wh2o -= eta * ( wd * Wh2o + g_Wh2o );
-        // no regularization for bias
-        hbias-= eta * g_hbias;
-        obias-= eta * g_obias;
-    }
-private:
-    // forward convolution, tmp_col and tmp_dst are helper structures
-    inline static void ConvForward( const Tensor &in, const Tensor &kernel, Tensor &out,
-                                    int ksize, int kstride,
-                                    TensorContainer &tmp_col, TensorContainer& tmp_dst ){
-        index_t oheight  = (in.shape[1] - ksize)/kstride + 1;
-        index_t owidth   = (in.shape[0] - ksize)/kstride + 1;
-        index_t nbatch   = in.shape[3];
-        index_t nchannel = out.shape[2];
-        // we directly unpack all local patches and do a dot product
-        // this costs lots of memory; normally for large images, only unpack several images at a time
-        tmp_col.Resize( Shape2( in.shape[2]*ksize*ksize, nbatch*oheight*owidth ) );
-        tmp_dst.Resize( Shape2( nchannel, nbatch*oheight*owidth ) );
-        // unpack local patches , stride=1
-        tmp_col = unpack_patch2col( in, ksize, kstride );
-        tmp_dst = dot( kernel, tmp_col );
-        // reshape, then swap axis, we chain equations together
-        out = swapaxis<2,3>( reshape( tmp_dst, Shape4( nchannel, nbatch, oheight, owidth ) ) );
-    }
-
-    // backward convolution, calculate gradient of kernel, and backprop back to in
-    inline static void ConvBackWard( const Tensor &out, const Tensor &kernel,
-                                     Tensor &g_kernel, Tensor &in,
-                                     int ksize, int kstride,
-                                     TensorContainer &tmp_col, TensorContainer& tmp_dst ){
-        index_t oheight  = (in.shape[1] - ksize)/kstride + 1;
-        index_t owidth   = (in.shape[0] - ksize)/kstride + 1;
-        index_t nbatch   = in.shape[3];
-        index_t nchannel = out.shape[2];
-        // we directly unpack all local patches and do a dot product
-        // this costs lots of memory; normally for large images, only unpack several images at a time
-        tmp_col.Resize( Shape2( in.shape[2]*ksize*ksize, nbatch*oheight*owidth ) );
-        tmp_dst.Resize( Shape2( nchannel, nbatch*oheight*owidth ) );
-        // unpack local patches
-        tmp_col = unpack_patch2col( in, ksize, kstride );
-        tmp_dst = reshape( swapaxis<2,3>( out ), tmp_dst.shape );
-        g_kernel = dot( tmp_dst, tmp_col.T() );
+  private:
+  // forward convolution, tmp_col and tmp_dst are helper structures
+  inline static void ConvForward(const Tensor &in,
+                                 const Tensor &kernel,
+                                 Tensor &out,
+                                 int ksize, int kstride,
+                                 TensorContainer &tmp_col,
+                                 TensorContainer &tmp_dst) {
+    index_t oheight  = (in.size(2) - ksize)/kstride + 1;
+    index_t owidth   = (in.size(3) - ksize)/kstride + 1;
+    index_t nbatch   = in.size(0);
+    index_t nchannel = out.size(1);
+    // we directly unpack all local patches and do a dot product
+    // this costs lots of memory; normally for large images, only unpack several images at a time
+    tmp_col.Resize(Shape2(in.size(1)*ksize*ksize, nbatch*oheight*owidth));
+    tmp_dst.Resize(Shape2(nchannel, nbatch*oheight*owidth));
+    // unpack local patches , stride=1
+    tmp_col = unpack_patch2col(in, ksize, ksize, kstride);
+    tmp_dst = dot(kernel, tmp_col);
+    // reshape, then swap axis, we chain equations together
+    out = swapaxis<1,0>(reshape(tmp_dst, Shape4(nchannel, nbatch, oheight, owidth)));
+  }
+  // backward convolution, calculate gradient of kernel, and backprop back to in
+  inline static void ConvBackWard(const Tensor &out,
+                                  const Tensor &kernel,
+                                  Tensor &g_kernel,
+                                  Tensor &in,
+                                  int ksize, int kstride,
+                                  TensorContainer &tmp_col,
+                                  TensorContainer &tmp_dst) {
+    index_t oheight  = (in.size(2) - ksize)/kstride + 1;
+    index_t owidth   = (in.size(3) - ksize)/kstride + 1;
+    index_t nbatch   = in.size(0);
+    index_t nchannel = out.size(1);
+    // we directly unpack all local patches and do a dot product
+    // this costs lots of memory; normally for large images, only unpack several images at a time
+    tmp_col.Resize(Shape2(in.size(1) * ksize * ksize,
+                          nbatch * oheight * owidth));
+    tmp_dst.Resize(Shape2(nchannel, nbatch * oheight * owidth));
+    // unpack local patches
+    tmp_col = unpack_patch2col(in, ksize, ksize, kstride);
+    tmp_dst = reshape(swapaxis<1,0>(out), tmp_dst.shape_);
+    g_kernel = dot(tmp_dst, tmp_col.T());
    // backpropagation: not necessary for first layer, but included anyway
-        tmp_col = dot( kernel.T(), tmp_dst );
-        in = pack_col2patch( tmp_col, in.shape, ksize, kstride );
-    }
-private:
-    // random seed generator
-    Random rnd;
-    // kernel size, pooling size
-    int ksize, kstride, psize;
-    // nodes in neural net
-    TensorContainer ninput, nhidden, nhiddenbak, npool, npoolbak;
-    TensorContainer nflat, nout;
-    // temp helper structure
-    TensorContainer tmp_col, tmp_dst;
-    // hidden bias, gradient
-    TensorContainer hbias, obias, g_hbias, g_obias;
-    // weight, gradient: Ki2h is actually the convolution kernel, with shape=(num_channel,ksize*ksize)
-    TensorContainer Ki2h, Wh2o, g_Ki2h, g_Wh2o;
+    tmp_col = dot(kernel.T(), tmp_dst);
+    in = pack_col2patch(tmp_col, in.shape_, ksize, ksize, kstride);
+  }
+  private:
+  // random seed generator
+  Random rnd;
+  // kernel size, pooling size
+  int ksize, kstride, psize;
+  // nodes in neural net
+  TensorContainer ninput, nhidden, nhiddenbak, npool, npoolbak;
+  TensorContainer nflat, nout;
+  // temp helper structure
+  TensorContainer tmp_col, tmp_dst;
+  // hidden bias, gradient
+  TensorContainer hbias, obias, g_hbias, g_obias;
+  // weight, gradient: Ki2h is actually the convolution kernel, with shape=(num_channel,ksize*ksize)
+  TensorContainer Ki2h, Wh2o, g_Ki2h, g_Wh2o;
};
// helper function to get the max index
-inline int MaxIndex( Tensor pred ){
-    int maxidx = 0;
-    for( index_t i = 1; i < pred.shape[0]; ++i ){
-        if( pred[i] > pred[maxidx] ) maxidx = (int)i;
-    }
-    return maxidx;
+inline int MaxIndex(Tensor pred) {
+  int maxidx = 0;
+  for (index_t i = 1; i < pred.size(0); ++i) {
+    if(pred[i] > pred[maxidx]) maxidx = (int)i;
+  }
+  return maxidx;
}

-int main( int argc, char *argv[] ){
-    if( argc < 2 ){
-        printf("Usage: cpu or gpu\n"); return 0;
+int main(int argc, char *argv[]) {
+  if(argc < 2) {
+    printf("Usage: cpu or gpu\n"); return 0;
+  }
+  srand(0);
+  InitTensorEngine();
+
+  // settings
+  int batch_size = 100;
+  int insize = 28;
+  int nchannel = 10;
+  int ksize = 5;
+  int kstride = 1;
+  int psize = 2;
+  int num_out = 10;
+
+  // choose which version to use
+  INNet *net;
+  if(!strcmp(argv[1], "gpu")) {
+    net = new ConvNet<gpu>(batch_size, insize, nchannel, ksize, kstride, psize, num_out);
+  }else{
+    net = new ConvNet<cpu>(batch_size, insize, nchannel, ksize, kstride, psize, num_out);
+  }
+
+  // temp output layer
+  TensorContainer pred;
+  pred.Resize(Shape2(batch_size, num_out));
+
+  // label
+  std::vector ytrain, ytest;
+  // data
+  TensorContainer xtrain_, xtest_;
+  LoadMNIST("train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain_, true);
+  LoadMNIST("t10k-images-idx3-ubyte", "t10k-labels-idx1-ubyte", ytest, xtest_, false);
+
+  TensorContainer xtrain(Shape4(xtrain_.size(1), 1, insize, insize));
+  TensorContainer xtest(Shape4(xtest_.size(1), 1, insize, insize));
+  xtrain = reshape(xtrain_, xtrain.shape_);
+  xtest = reshape(xtest_, xtest.shape_);
+
+  int num_iter = 20;
+
+  for (int i = 0; i < num_iter; ++ i) {
+    // training
+    for (index_t j = 0; j + batch_size <= xtrain.size(0); j += batch_size) {
+      net->Forward(xtrain.Slice(j, j + batch_size), pred);
+      // set gradient into pred
+      for (int k = 0; k < batch_size; ++ k) {
+        pred[k][ ytrain[k+j] ] -= 1.0f;
+      }
+      // scale gradient by batch size
+      pred *= 1.0f / batch_size;
+      // run backprop
+      net->Backprop(pred);
+      // update net parameters
+      net->Update();
    }
-    srand(0);
-    InitTensorEngine();
-
-    // settings
-    int batch_size = 100;
-    int insize = 28;
-    int nchannel = 10;
-    int ksize = 5;
-    int kstride = 1;
-    int psize = 2;
-    int num_out = 10;
-
-    // choose which version to use
-    INNet *net;
-    if( !strcmp( argv[1], "gpu") ) {
-        net = new ConvNet<gpu>( batch_size, insize, nchannel, ksize, kstride, psize, num_out );
-    }else{
-        net = new ConvNet<cpu>( batch_size, insize, nchannel, ksize, kstride, psize, num_out );
+    // evaluation
+    long nerr = 0;
+    for (index_t j = 0; j + batch_size <= xtest.size(0); j += batch_size) {
+      net->Forward(xtest.Slice(j, j + batch_size), pred);
+      for (int k = 0; k < batch_size; ++ k) {
+        nerr += MaxIndex(pred[k]) != ytest[j+k];
+      }
    }
-
-    // temp output layer
-    TensorContainer pred;
-    pred.Resize( Shape2( batch_size, num_out ) );
-
-    // label
-    std::vector ytrain, ytest;
-    // data
-    TensorContainer xtrain_, xtest_;
-    LoadMNIST( "train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain_, true);
"train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain_, true); - LoadMNIST( "t10k-images-idx3-ubyte", "t10k-labels-idx1-ubyte", ytest, xtest_, false); - - TensorContainer xtrain( Shape4(xtrain_.shape[1], 1, insize, insize) ); - TensorContainer xtest( Shape4(xtest_.shape[1], 1, insize, insize) ); - xtrain = reshape( xtrain_, xtrain.shape ); - xtest = reshape( xtest_, xtest.shape ); - - int num_iter = 20; - - for( int i = 0; i < num_iter; ++ i ){ - // training - for( index_t j = 0; j + batch_size <= xtrain.shape[3]; j += batch_size ){ - net->Forward( xtrain.Slice( j, j + batch_size ), pred ); - // set gradient into pred - for( int k = 0; k < batch_size; ++ k ){ - pred[k][ ytrain[k+j] ] -= 1.0f; - } - // scale gradient by batchs zie - pred *= 1.0f / batch_size; - // run backprop - net->Backprop( pred ); - // update net parameters - net->Update(); - } - // evaluation - long nerr = 0; - for( index_t j = 0; j + batch_size <= xtest.shape[3]; j += batch_size ){ - net->Forward( xtest.Slice( j, j + batch_size ), pred ); - for( int k = 0; k < batch_size; ++ k ){ - nerr += MaxIndex( pred[k] ) != ytest[j+k]; - - } - } - printf("round %d: test-err=%f\n", i, (float)nerr/xtest.shape[3] ); - } - delete net; - ShutdownTensorEngine(); - return 0; + printf("round %d: test-err=%f\n", i, (float)nerr/xtest.size(0)); + } + delete net; + ShutdownTensorEngine(); + return 0; } diff --git a/example/neuralnet/nnet.cu b/example/neuralnet/nnet.cu index 75c623a68d7e..545a6e43428e 100644 --- a/example/neuralnet/nnet.cu +++ b/example/neuralnet/nnet.cu @@ -1,5 +1,6 @@ // this implements a simple two layer neural net #include +#include // header file to use mshadow #include "mshadow/tensor.h" // helper function to load mnist dataset @@ -11,18 +12,18 @@ using namespace mshadow::expr; // define sigmoid operation struct sigmoid{ - MSHADOW_XINLINE static real_t Map(real_t a) { - return 1.0f/(1.0f+expf(-a)); - } + MSHADOW_XINLINE static real_t Map(real_t a) { + return 1.0f/(1.0f+expf(-a)); + } }; /*! \brief interface for nnet, interfacd allows use to use GPU/CPU implementation in a unified way */ class INNet{ -public: - virtual void Forward( const Tensor& inbatch, Tensor &oubatch ) = 0; - virtual void Backprop( const Tensor& gradout ) = 0; - virtual void Update( void ) = 0; - virtual ~INNet(){} + public: + virtual void Forward(const Tensor& inbatch, Tensor &oubatch) = 0; + virtual void Backprop(const Tensor& gradout) = 0; + virtual void Update(void) = 0; + virtual ~INNet() {} }; /*! 
diff --git a/example/neuralnet/nnet.cu b/example/neuralnet/nnet.cu
index 75c623a68d7e..545a6e43428e 100644
--- a/example/neuralnet/nnet.cu
+++ b/example/neuralnet/nnet.cu
@@ -1,5 +1,6 @@
 // this implements a simple two layer neural net
 #include
+#include
 // header file to use mshadow
 #include "mshadow/tensor.h"
 // helper function to load mnist dataset
@@ -11,18 +12,18 @@ using namespace mshadow::expr;

 // define sigmoid operation
 struct sigmoid{
-    MSHADOW_XINLINE static real_t Map(real_t a) {
-        return 1.0f/(1.0f+expf(-a));
-    }
+  MSHADOW_XINLINE static real_t Map(real_t a) {
+    return 1.0f/(1.0f+expf(-a));
+  }
};

/*! \brief interface for nnet; the interface allows the user to use GPU/CPU implementations in a unified way */
class INNet{
-public:
-    virtual void Forward( const Tensor& inbatch, Tensor &oubatch ) = 0;
-    virtual void Backprop( const Tensor& gradout ) = 0;
-    virtual void Update( void ) = 0;
-    virtual ~INNet(){}
+ public:
+  virtual void Forward(const Tensor& inbatch, Tensor &oubatch) = 0;
+  virtual void Backprop(const Tensor& gradout) = 0;
+  virtual void Update(void) = 0;
+  virtual ~INNet() {}
};

/*!
@@ -30,156 +31,153 @@ public:
 * this implementation is device invariant
 */
template
-class NNet : public INNet{
-public:
-    // initialize the network
-    NNet( int batch_size, int num_in, int num_hidden, int num_out ):rnd(0){
-        // setup nodes
-        ninput.Resize( Shape2( batch_size, num_in ) );
-        nhidden.Resize( Shape2( batch_size, num_hidden ) );
-        nhiddenbak.Resize( nhidden.shape );
-        nout.Resize( Shape2( batch_size, num_out ) );
-        // setup bias
-        hbias.Resize( Shape1( num_hidden ) ); g_hbias.Resize( hbias.shape );
-        obias.Resize( Shape1( num_out ) );    g_obias.Resize( obias.shape );
-        hbias = 0.0f; obias = 0.0f;
-        // setup weights
-        Wi2h.Resize( Shape2( num_in, num_hidden ) );  g_Wi2h.Resize( Wi2h.shape );
-        Wh2o.Resize( Shape2( num_hidden, num_out ) ); g_Wh2o.Resize( Wh2o.shape );
-        rnd.SampleGaussian( Wi2h, 0, 0.01f );
-        rnd.SampleGaussian( Wh2o, 0, 0.01f );
-
-    }
-    virtual ~NNet(){}
-    // forward propagation
-    virtual void Forward( const Tensor& inbatch, Tensor &oubatch ){
-        // note: in mshadow, shape[0] means lowest dimension, shape[1] is number of rows in matrix
-        // this is different from numpy convention
-        index_t batch_size = inbatch.shape[1];
-        // copy data to input layer
-        Copy( ninput, inbatch );
-        // first layer, fullc
-        nhidden = dot( ninput, Wi2h );
-        nhidden+= repmat( hbias, batch_size );
-        // activation, sigmoid, backup activation in nhidden
-        nhidden = F<sigmoid>( nhidden );
-        Copy( nhiddenbak, nhidden );
-        // second layer fullc
-        nout = dot( nhiddenbak, Wh2o );
-        nout += repmat( obias, batch_size );
-        // softmax calculation
-        Softmax( nout, nout );
-        // copy result out
-        Copy( oubatch, nout );
-    }
-    // back propagation
-    virtual void Backprop( const Tensor& gradout ){
-        // copy gradient to output layer
-        Copy( nout, gradout );
-        // calc grad of layer 2
-        g_obias = sum_rows( nout );
-        g_Wh2o  = dot( nhiddenbak.T(), nout );
-        // backprop to layer 1
-        nhiddenbak = dot( nout, Wh2o.T() );
-        // calculate gradient of sigmoid layer
-        nhidden = nhidden * (1.0f-nhidden) * nhiddenbak;
-        // calc grad of layer 1
-        g_hbias = sum_rows( nhidden );
-        g_Wi2h  = dot( ninput.T(), nhidden );
-    }
+class NNet : public INNet {
+ public:
+  // initialize the network
+  NNet(int batch_size, int num_in, int num_hidden, int num_out) : rnd(0) {
+    // setup nodes
+    ninput.Resize(Shape2(batch_size, num_in));
+    nhidden.Resize(Shape2(batch_size, num_hidden));
+    nhiddenbak.Resize(nhidden.shape_);
+    nout.Resize(Shape2(batch_size, num_out));
+    // setup bias
+    hbias.Resize(Shape1(num_hidden)); g_hbias.Resize(hbias.shape_);
+    obias.Resize(Shape1(num_out)); g_obias.Resize(obias.shape_);
+    hbias = 0.0f; obias = 0.0f;
+    // setup weights
+    Wi2h.Resize(Shape2(num_in, num_hidden));  g_Wi2h.Resize(Wi2h.shape_);
+    Wh2o.Resize(Shape2(num_hidden, num_out)); g_Wh2o.Resize(Wh2o.shape_);
+    rnd.SampleGaussian(&Wi2h, 0, 0.01f);
+    rnd.SampleGaussian(&Wh2o, 0, 0.01f);
+  }
+  virtual ~NNet() {}
+  // forward propagation
+  virtual void Forward(const Tensor& inbatch,
+                       Tensor &oubatch) {
+    // size is same convention as numpy
+    index_t batch_size = inbatch.size(0);
+    // copy data to input layer
+    Copy(ninput, inbatch);
+    // first layer, fullc
+    nhidden = dot(ninput, Wi2h);
+    nhidden+= repmat(hbias, batch_size);
+    // activation, sigmoid, backup activation in nhidden
+    nhidden = F<sigmoid>(nhidden);
+    Copy(nhiddenbak, nhidden);
+    // second layer fullc
+    nout = dot(nhiddenbak, Wh2o);
+    nout += repmat(obias, batch_size);
+    // softmax calculation
+    Softmax(nout, nout);
+    // copy result out
+    Copy(oubatch, nout);
+  }
+  // back propagation
+  virtual void Backprop(const Tensor& gradout) {
+    // copy gradient to output layer
+    Copy(nout, gradout);
+    // calc grad of layer 2
+    g_obias = sum_rows(nout);
+    g_Wh2o  = dot(nhiddenbak.T(), nout);
+    // backprop to layer 1
+    nhiddenbak = dot(nout, Wh2o.T());
+    // calculate gradient of sigmoid layer
+    nhidden = nhidden * (1.0f-nhidden) * nhiddenbak;
+    // calc grad of layer 1
+    g_hbias = sum_rows(nhidden);
+    g_Wi2h  = dot(ninput.T(), nhidden);
+  }
+  // update weight
+  virtual void Update(void) {
+    // run SGD
+    const float eta = 0.8;
+    const float wd = 0.00001;
    // update weight
-    virtual void Update( void ){
-        // run SGD
-        const float eta = 0.8;
-        const float wd = 0.00001;
-        // update weight
-        Wi2h -= eta * ( wd * Wi2h + g_Wi2h );
-        Wh2o -= eta * ( wd * Wh2o + g_Wh2o );
-        // no regularization for bias
-        hbias-= eta * g_hbias;
-        obias-= eta * g_obias;
-    }
-private:
-    // random seed generator
-    Random rnd;
-    // nodes in neural net
-    TensorContainer ninput, nhidden, nhiddenbak, nout;
-    // hidden bias, gradient
-    TensorContainer hbias, obias, g_hbias, g_obias;
-    // weight gradient
-    TensorContainer Wi2h, Wh2o, g_Wi2h, g_Wh2o;
+    Wi2h -= eta * (wd * Wi2h + g_Wi2h);
+    Wh2o -= eta * (wd * Wh2o + g_Wh2o);
+    // no regularization for bias
+    hbias-= eta * g_hbias;
+    obias-= eta * g_obias;
+  }
+ private:
+  // random seed generator
+  Random rnd;
+  // nodes in neural net
+  TensorContainer ninput, nhidden, nhiddenbak, nout;
+  // hidden bias, gradient
+  TensorContainer hbias, obias, g_hbias, g_obias;
+  // weight gradient
+  TensorContainer Wi2h, Wh2o, g_Wi2h, g_Wh2o;
};
-
// helper function to get the max index
-inline int MaxIndex( Tensor pred ){
-    int maxidx = 0;
-    for( index_t i = 1; i < pred.shape[0]; ++i ){
-        if( pred[i] > pred[maxidx] ) maxidx = (int)i;
-    }
-    return maxidx;
+inline int MaxIndex(Tensor pred) {
+  int maxidx = 0;
+  for(index_t i = 1; i < pred.size(0); ++i) {
+    if(pred[i] > pred[maxidx]) maxidx = (int)i;
+  }
+  return maxidx;
}

-int main( int argc, char *argv[] ){
-    if( argc < 2 ){
-        printf("Usage: cpu or gpu\n"); return 0;
+int main(int argc, char *argv[]) {
+  if(argc < 2) {
+    printf("Usage: cpu or gpu\n"); return 0;
+  }
+  srand(0);
+  InitTensorEngine();
+
+  // settings
+  int batch_size = 100;
+  int num_in = 28 * 28;
+  int num_hidden = 100;
+  int num_out = 10;
+  // choose which version to use
+  INNet *net;
+  if(!strcmp(argv[1], "gpu")) {
+    net = new NNet<gpu>(batch_size, num_in, num_hidden, num_out);
+  }else{
+    net = new NNet<cpu>(batch_size, num_in, num_hidden, num_out);
+  }
+
+  // temp output layer
+  TensorContainer pred;
+  pred.Resize(Shape2(batch_size, num_out));
+
+  // label
+  std::vector ytrain, ytest;
+  // data
+  TensorContainer xtrain, xtest;
+  LoadMNIST("train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain, true);
+  LoadMNIST("t10k-images-idx3-ubyte", "t10k-labels-idx1-ubyte", ytest, xtest, false);
+
+  int num_iter = 20;
+
+  for (int i = 0; i < num_iter; ++ i) {
+    // training
+    for (index_t j = 0; j + batch_size <= xtrain.size(0); j += batch_size) {
+      net->Forward(xtrain.Slice(j, j + batch_size), pred);
+      // set gradient into pred
+      for (int k = 0; k < batch_size; ++ k) {
+        pred[k][ ytrain[k+j] ] -= 1.0f;
+      }
+      // scale gradient by batch size
+      pred *= 1.0f / batch_size;
+      // run backprop
+      net->Backprop(pred);
+      // update net parameters
+      net->Update();
    }
-    srand(0);
-    InitTensorEngine();
-
-    // settings
-    int batch_size = 100;
-    int num_in = 28 * 28;
-    int num_hidden = 100;
-    int num_out = 10;
-
-    // choose which version to use
-    INNet *net;
-    if( !strcmp( argv[1], "gpu") ) {
-        net = new NNet<gpu>( batch_size, num_in, num_hidden, num_out );
-    }else{
-        net = new NNet<cpu>( batch_size, num_in, num_hidden, num_out );
+    // evaluation
+    long nerr = 0;
+    for (index_t j = 0; j + batch_size <= xtest.size(0); j += batch_size) {
+      net->Forward(xtest.Slice(j, j + batch_size), pred);
+      for (int k = 0; k < batch_size; ++ k) {
+        nerr += MaxIndex(pred[k]) != ytest[j+k];
+
+      }
    }
-
-    // temp output layer
-    TensorContainer pred;
-    pred.Resize( Shape2( batch_size, num_out ) );
-
-    // label
-    std::vector ytrain, ytest;
-    // data
-    TensorContainer xtrain, xtest;
-    LoadMNIST( "train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain, true);
-    LoadMNIST( "t10k-images-idx3-ubyte", "t10k-labels-idx1-ubyte", ytest, xtest, false);
-
-    int num_iter = 20;
-
-    for( int i = 0; i < num_iter; ++ i ){
-        // training
-        for( index_t j = 0; j + batch_size <= xtrain.shape[1]; j += batch_size ){
-            net->Forward( xtrain.Slice( j, j + batch_size ), pred );
-            // set gradient into pred
-            for( int k = 0; k < batch_size; ++ k ){
-                pred[k][ ytrain[k+j] ] -= 1.0f;
-            }
-            // scale gradient by batch size
-            pred *= 1.0f / batch_size;
-            // run backprop
-            net->Backprop( pred );
-            // update net parameters
-            net->Update();
-        }
-        // evaluation
-        long nerr = 0;
-        for( index_t j = 0; j + batch_size <= xtest.shape[1]; j += batch_size ){
-            net->Forward( xtest.Slice( j, j + batch_size ), pred );
-            for( int k = 0; k < batch_size; ++ k ){
-                nerr += MaxIndex( pred[k] ) != ytest[j+k];
-
-            }
-        }
-        printf("round %d: test-err=%f\n", i, (float)nerr/xtest.shape[1] );
-    }
-    delete net;
-    ShutdownTensorEngine();
-    return 0;
+    printf("round %d: test-err=%f\n", i, (float)nerr/xtest.size(0));
+  }
+  delete net;
+  ShutdownTensorEngine();
+  return 0;
}
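Both example trainers rely on the same identity: with softmax outputs and cross-entropy loss, the gradient with respect to the pre-softmax activations is p - onehot(label). That is why the training loops run Softmax in Forward and then only subtract 1.0f at the label index and rescale before calling Backprop. As a sketch of that step in plain C++ (illustrative names, not part of the examples):

// turn softmax probabilities into the averaged cross-entropy gradient in place;
// grad is batch_size x num_out, row-major, and initially holds the probabilities
void SoftmaxCrossEntropyGrad(float *grad, const int *label,
                             int batch_size, int num_out) {
  for (int k = 0; k < batch_size; ++k)
    grad[k * num_out + label[k]] -= 1.0f;   // p - onehot
  for (int i = 0; i < batch_size * num_out; ++i)
    grad[i] /= batch_size;                  // scale gradient by batch size
}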
diff --git a/example/neuralnet/util.h b/example/neuralnet/util.h
index 50bcef3fdd90..f58203c7667a 100644
--- a/example/neuralnet/util.h
+++ b/example/neuralnet/util.h
@@ -4,9 +4,11 @@
 #include
 #include "mshadow/tensor.h"

+typedef float real_t;
+
 using namespace mshadow;

-int pack( unsigned char zz[4] ){
+int pack(unsigned char zz[4]){
  return (int)(zz[3])
      | (((int)(zz[2])) << 8)
      | (((int)(zz[1])) << 16)
@@ -14,69 +16,71 @@
}

template
-inline void shuffle( T *data, size_t sz ){
-    if( sz == 0 ) return;
-    for( size_t i = sz - 1; i > 0; i-- ){
-        std::swap( data[i], data[ rand() % ( i+1 ) ] );
-    }
+inline void shuffle(T *data, size_t sz){
+  if(sz == 0) return;
+  for(size_t i = sz - 1; i > 0; i--){
+    std::swap(data[i], data[rand() % (i+1)]);
+  }
}
// random shuffle the data inside, require PRNG
template
-inline void shuffle( std::vector &data ){
-    shuffle( &data[0], data.size() );
+inline void shuffle(std::vector &data){
+  shuffle(&data[0], data.size());
}
// simple function to load in mnist
-inline void LoadMNIST( const char *path_img, const char *path_label,
-                       std::vector& ylabel, TensorContainer& xdata, bool do_shuffle ){
-    // load in data
-    FILE *fi = fopen( path_img, "rb" );
-    if( fi == NULL ){
-        printf("cannot open %s\n", path_img );
-        exit(-1);
-    }
-    unsigned char zz[4];
-    unsigned char *t_data, *l_data;
-    int num_image, width, height, nlabel;
-    assert( fread(zz, 4 , 1, fi ) );
-    assert( fread(zz, 4 , 1, fi ) );
-    num_image = pack( zz );
-    assert( fread(zz, 4 , 1, fi ) );
-    width = pack( zz );
-    assert( fread(zz, 4 , 1, fi ) );
-    height = pack( zz );
-
-    int step = width * height;
-    t_data = new unsigned char[ num_image * step ];
-    assert( fread( t_data, step*num_image , 1 , fi ) );
-    fclose( fi );
-
-    // load in label
-    fi = fopen( path_label, "rb" );
-    assert( fread(zz, 4 , 1, fi ) );
-    assert( fread(zz, 4 , 1, fi ) );
-    nlabel = pack( zz );
-    assert( num_image == nlabel );
-    l_data = new unsigned char[ num_image ];
-    assert( fread( l_data, num_image , 1 , fi ) );
-    // try to do shuffle
-    std::vector rindex;
-    for( int i = 0; i < num_image; ++ i ){
-        rindex.push_back( i );
-    }
-    if( do_shuffle ){
-        shuffle( rindex );
-    }
-
-    // save out result
-    ylabel.resize( num_image );
-    xdata.Resize( Shape2( num_image, width * height ) );
-    for( int i = 0 ; i < num_image ; ++i ){
-        for( int j = 0; j < step; ++j ) {
-            xdata[ i ][ j ] = (float)(t_data[ rindex[i]*step + j ]) / 256.0f;
-        }
-        ylabel[ i ] = l_data[ rindex[i] ];
-    }
-    delete[] t_data; delete [] l_data;
-    printf("finish loading %dx%d matrix from %s, shuffle=%d\n", num_image, step, path_img, (int)do_shuffle );
+inline void LoadMNIST(const char *path_img, const char *path_label,
+                      std::vector &ylabel,
+                      TensorContainer &xdata,
+                      bool do_shuffle){
+  // load in data
+  FILE *fi = fopen(path_img, "rb");
+  if (fi == NULL) {
+    printf("cannot open %s\n", path_img);
+    exit(-1);
+  }
+  unsigned char zz[4];
+  unsigned char *t_data, *l_data;
+  int num_image, width, height, nlabel;
+  assert(fread(zz, 4 , 1, fi));
+  assert(fread(zz, 4 , 1, fi));
+  num_image = pack(zz);
+  assert(fread(zz, 4 , 1, fi));
+  width = pack(zz);
+  assert(fread(zz, 4 , 1, fi));
+  height = pack(zz);
+
+  int step = width * height;
+  t_data = new unsigned char[num_image * step];
+  assert(fread(t_data, step*num_image , 1 , fi));
+  fclose(fi);
+
+  // load in label
+  fi = fopen(path_label, "rb");
+  assert(fread(zz, 4 , 1, fi));
+  assert(fread(zz, 4 , 1, fi));
+  nlabel = pack(zz);
+  assert(num_image == nlabel);
+  l_data = new unsigned char[num_image];
+  assert(fread(l_data, num_image , 1 , fi));
+  // try to do shuffle
+  std::vector rindex;
+  for (int i = 0; i < num_image; ++ i) {
+    rindex.push_back(i);
+  }
+  if (do_shuffle) {
+    shuffle(rindex);
+  }
+
+  // save out result
+  ylabel.resize(num_image);
+  xdata.Resize(Shape2(num_image, width * height));
+  for (int i = 0 ; i < num_image ; ++i) {
+    for(int j = 0; j < step; ++j) {
+      xdata[i][j] = (float)(t_data[rindex[i]*step + j]) / 256.0f;
+    }
+    ylabel[i] = l_data[rindex[i]];
+  }
+  delete[] t_data; delete [] l_data;
+  printf("finish loading %dx%d matrix from %s, shuffle=%d\n", num_image, step, path_img, (int)do_shuffle);
}
diff --git a/mshadow/base.h b/mshadow/base.h
index e5be86aa4bf3..5ccd0415a062 100644
--- a/mshadow/base.h
+++ b/mshadow/base.h
@@ -301,7 +301,7 @@ struct maximum {
  /*! \brief do reduction into dst */
  template
  MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src) {
-    using std::max;
+    using namespace std;
    dst = max(dst, src);
  }
  /*!
diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index 5db82d88b701..b9e96a46fa89 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -198,7 +198,7 @@ __global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) { } for (unsigned x = x_size; x < xmax; x += x_size) { if (x + threadIdx.x < xmax) { - DType a = src[y][x + threadIdx.x]; + DType a = src.Eval(y, x + threadIdx.x); s_rec[threadIdx.x] = max(a, s_rec[threadIdx.x]); } } @@ -239,11 +239,13 @@ template inline void Softmax(Tensor &dst, const Tensor &src) { dim3 dimBlock(kBaseThreadNum); - dim3 dimGrid(dst.shape[0]); + dim3 dimGrid(dst.size(0)); utils::Check(dst.shape_ == src.shape_, "Softmax: shape mismatch"); CheckLaunchParam(dimGrid, dimBlock, "Softmax"); SoftmaxKernel - <<>>(expr::MakePlan(dst), expr::MakePlan(src)); + <<>>(expr::MakePlan(dst), + expr::MakePlan(src), + dst.size(1)); } } // namespace cuda } // namespace mshadow diff --git a/mshadow/extension/channel_pool.h b/mshadow/extension/channel_pool.h index 6b0b32553332..0b4b3cbf8a2b 100644 --- a/mshadow/extension/channel_pool.h +++ b/mshadow/extension/channel_pool.h @@ -66,7 +66,7 @@ struct Plan, DType> { height_(e.shape_[srcdim - 2]), width_(e.shape_[srcdim - 1]), hnsize_(e.nsize_ / 2) {} MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { - using std::min; + using namespace std; const index_t y = i % height_; i /= height_; const index_t c = i % channel_; diff --git a/mshadow/extension/pack_col2patch.h b/mshadow/extension/pack_col2patch.h index cc8843f37916..73d378e0a257 100644 --- a/mshadow/extension/pack_col2patch.h +++ b/mshadow/extension/pack_col2patch.h @@ -85,7 +85,7 @@ struct Plan, DType> { // note: i/o convention are same as unpack } MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { - using std::min; + using namespace std; const index_t y = i % i_height_; const index_t idivh = i / i_height_; const index_t c = idivh % i_channel_; diff --git a/mshadow/extension/spatial_pool.h b/mshadow/extension/spatial_pool.h index aa0435fdab0f..07f8433cca27 100644 --- a/mshadow/extension/spatial_pool.h +++ b/mshadow/extension/spatial_pool.h @@ -117,7 +117,7 @@ struct Plan, DType> { src_height_(e.src_height_), src_width_(e.src_width_), new_height_(e.shape_[srcdim - 2]) {} MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { - using std::min; + using namespace std; const index_t py = i % new_height_; const index_t y_start = py * kstride_; const index_t y_end = min(y_start + ksize_y_, src_height_); diff --git a/mshadow/extension/spatial_unpool.h b/mshadow/extension/spatial_unpool.h index 092819c32697..441e04b3b815 100644 --- a/mshadow/extension/spatial_unpool.h +++ b/mshadow/extension/spatial_unpool.h @@ -95,7 +95,7 @@ struct Plan, DType> { pshape_y_(e.pshape_y_), pshape_x_(e.pshape_x_), ksize_y_(e.ksize_y_), ksize_x_(e.ksize_x_), kstride_(e.kstride_) {} MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { - using std::min; + using namespace std; const index_t x = j; const index_t y = i % sshape_y_; const index_t c = i / sshape_y_; diff --git a/mshadow/tensor.h b/mshadow/tensor.h index 8f5fb185e2ad..0e87ea245df0 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -302,7 +302,7 @@ struct Tensor: public TRValue, */ MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const { - Shape s = this->shape; + Shape s = this->shape_; s[0] = end - begin; return Tensor(dptr_ + this->MSize<1>() * begin, s, stride_); diff --git a/mshadow/tensor_container.h b/mshadow/tensor_container.h index 
42542d7490da..e2aba78e4186 100644
--- a/mshadow/tensor_container.h
+++ b/mshadow/tensor_container.h
@@ -39,7 +39,7 @@ class TensorContainer: public Tensor {
   */
  explicit TensorContainer(const Shape &shape) {
    this->pad_ = MSHADOW_ALLOC_PAD;
-    data_.dptr = NULL;
+    data_.dptr_ = NULL;
    this->AllocByShape(shape);
  }
  /*!
diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h
index 70e3c8567542..56f48a300a7f 100644
--- a/mshadow/tensor_cpu-inl.h
+++ b/mshadow/tensor_cpu-inl.h
@@ -43,7 +43,7 @@ inline void FreeSpace(Tensor *obj) {
template
inline void Copy(Tensor _dst,
                 const Tensor &_src) {
-  utils::Check(_dst.shape == _src.shape, "Copy:shape mismatch");
+  utils::Check(_dst.shape_ == _src.shape_, "Copy:shape mismatch");
  Tensor dst = _dst.FlatTo2D();
  Tensor src = _src.FlatTo2D();
  for (index_t y = 0; y < dst.size(0); ++y) {
@@ -186,7 +186,7 @@ inline void Softmax(Tensor dst,
template
inline void Softmax(Tensor dst,
                    const Tensor &energy) {
-  utils::Check(dst.shape == energy.shape, "Softmax: shape mismatch");
+  utils::Check(dst.shape_ == energy.shape_, "Softmax: shape mismatch");
  for (index_t y = 0; y < dst.size(0); ++y) {
    Softmax(dst[y], energy[y]);
  }
@@ -195,7 +195,7 @@ inline void Softmax(Tensor dst,
template
inline DType VDot(const Tensor &lhs,
                  const Tensor &rhs) {
-  utils::Check(lhs.shape == rhs.shape, "VDot: shape mismatch");
+  utils::Check(lhs.shape_ == rhs.shape_, "VDot: shape mismatch");
  DType sum = static_cast(0);
  for (index_t x = 0; x < lhs.size(0); ++x) {
    sum += lhs[x] * rhs[x];
diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h
index f79fe82d2863..39d16090239c 100644
--- a/mshadow/tensor_gpu-inl.h
+++ b/mshadow/tensor_gpu-inl.h
@@ -67,7 +67,7 @@ inline void AllocSpace(Tensor *obj, bool pad) {
}
template
inline void FreeSpace(Tensor *obj) {
-  cudaFree(obj->dptr_); obj->dptr = NULL;
+  cudaFree(obj->dptr_); obj->dptr_ = NULL;
}
template
inline void Copy(Tensor _dst,

From 6331506aa97cee9adcf96454032876fd8e79e2a8 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sat, 27 Dec 2014 21:09:41 -0800
Subject: [PATCH 039/147] afix cpu compile

---
 mshadow/tensor_gpu-inl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h
index 39d16090239c..f3eade7aee9c 100644
--- a/mshadow/tensor_gpu-inl.h
+++ b/mshadow/tensor_gpu-inl.h
@@ -160,7 +160,7 @@ inline void Softmax(Tensor dst,
                    const Tensor& src) {
  cuda::Softmax(dst, src);
}
+} // namespace mshadow
#endif // __CUDACC__
#endif // MSHADOW_USE_CUDA
-} // namespace mshadow
#endif // MSHADOW_TENSOR_GPU_INL_H_

From de6826c316df3897193dba702e0ec648ac1ee758 Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Sat, 27 Dec 2014 22:12:50 -0700
Subject: [PATCH 040/147] chg

---
 example/defop.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/example/defop.cpp b/example/defop.cpp
index e1b1b311bb9a..d78616213353 100644
--- a/example/defop.cpp
+++ b/example/defop.cpp
@@ -3,7 +3,7 @@
 #include "mshadow/tensor.h"
 // this namespace contains all data structures, functions
 using namespace mshadow;
-// this namespace contains all operator overloads
+// this namespace contains all operator overloads
 using namespace mshadow::expr;

 // user defined unary operator addone
@@ -25,13 +25,13 @@ struct maxoftwo{
int main(void){
  // initialize tensor engine before using tensor operation, needed for CuBLAS
  //InitTensorEngine();
-    // take first subscript of the tensor
-    Tensor<cpu,2> mat = NewTensor<cpu>(Shape2(2,3), 0.0f);
-    Tensor<cpu,2> mat2= NewTensor<cpu>(Shape2(2,3), 0.0f);
-
+  // take first subscript of the tensor
+  Tensor<cpu,2> mat = NewTensor<cpu>(Shape2(2,3), 0.0f);
+  Tensor<cpu,2> mat2= NewTensor<cpu>(Shape2(2,3), 0.0f);
+
  mat[0][0] = -2.0f;
  mat = F<maxoftwo>(F<addone>(mat) + 1.0f, mat2);

  for(index_t i = 0; i < mat.size(0); ++i){
    for(index_t j = 0; j < mat.size(1); ++j){
      printf("%.2f ", mat[i][j]);
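The defop example above is the general recipe for extending mshadow with user-defined element-wise operators: declare a struct whose static MSHADOW_XINLINE Map takes as many arguments as the operator's arity, then apply it with F<op>(...), the same pattern as the relu and sigmoid structs in the neuralnet examples. A hypothetical clip operator would follow the same shape (a sketch, not part of the library or the examples):

// user-defined binary operator: clamp a into [-b, b]
struct clip {
  MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
    return a > b ? b : (a < -b ? -b : a);
  }
};
// usage, same as maxoftwo above: mat = F<clip>(mat, mat2);
// element-wise, and works on both cpu and gpu tensors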
From 86bbf1e3196bad9fab5ef6cae69e2014df7be503 Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Sun, 28 Dec 2014 17:48:33 -0700
Subject: [PATCH 041/147] test group 0

---
 test/Makefile | 33 +++++++++++++++++++++++++
 test/test.cu  | 64 ++++++++++++++++++++++++++++++++++++++++++++++++
 test/test.h   | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 164 insertions(+)
 create mode 100644 test/Makefile
 create mode 100644 test/test.cu
 create mode 100644 test/test.h

diff --git a/test/Makefile b/test/Makefile
new file mode 100644
index 000000000000..061b99b2e119
--- /dev/null
+++ b/test/Makefile
@@ -0,0 +1,33 @@
+# set LD_LIBRARY_PATH
+export CC  = gcc
+export CXX = g++
+export NVCC =nvcc
+export CFLAGS = -Wall -O3 -g -msse3 -Wno-unknown-pragmas -funroll-loops -I../
+export LDFLAGS= -g -lm -lcublas -lcudart
+export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX)
+
+# specify tensor path
+BIN =
+OBJ =
+CUOBJ =
+CUBIN = test
+.PHONY: clean all
+
+all: $(CUBIN) $(OBJ)
+
+test: test.cu
+
+$(BIN) :
+	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS)
+
+$(OBJ) :
+	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
+
+$(CUOBJ) :
+	$(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $(filter %.cu, $^)
+
+$(CUBIN) :
+	$(NVCC) -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -Xlinker "$(LDFLAGS)" $(filter %.cu %.cpp %.o, $^)
+
+clean:
+	$(RM) $(OBJ) $(BIN) $(CUBIN) $(CUOBJ) *~
diff --git a/test/test.cu b/test/test.cu
new file mode 100644
index 000000000000..aefd93063196
--- /dev/null
+++ b/test/test.cu
@@ -0,0 +1,64 @@
+#include "test.h"
+
+using namespace mshadow;
+
+
+int main() {
+  InitTensorEngine<gpu>();
+  Tensor<cpu, 3, float> tc = NewTensor<cpu>(Shape3(3, 2, 4), 0.0f);
+  Tensor<gpu, 3, float> tg = NewTensor<gpu>(tc.shape_, 0.0f);
+  // init
+  for (index_t i = 0; i < tc.size(0); ++i) {
+    for (index_t j = 0; j < tc.size(1); ++j) {
+      for (index_t k = 0; k < tc.size(2); ++k) {
+        tc[i][j][k] = i * 0.1f + j * 0.2f + k * 0.1f;
+      }
+    }
+  }
+  Copy(tg, tc);
+  // print
+  printf("\n#print batch 0 of cpu tensor:\n");
+  Print2DTensor(tc[0]);
+  // check
+  if (Check2DTensor(tg[1], tc[1])) {
+    printf("batch 1 of gpu & cpu tensors are the same.\n");
+  }
+  // sum of row
+  Tensor<cpu, 1, float> tmp_tc = NewTensor<cpu>(Shape1(tc[0].size(1)), 0.0f);
+  Tensor<gpu, 1, float> tmp_tg = NewTensor<gpu>(Shape1(tg[0].size(1)), 0.0f);
+  printf("\n#sum_rows of batch 0:\n");
+  tmp_tc = sum_rows(tc[0]);
+  tmp_tg = sum_rows(tg[0]);
+  Print1DTensor(tmp_tc);
+  if (Check1DTensor(tmp_tg, tmp_tc)) {
+    printf("cpu & gpu results are consistent\n");
+  }
+  FreeSpace(&tmp_tc);
+  FreeSpace(&tmp_tg);
+  // sumall_except_dim: Warning, may fail randomly!
+ printf("\n#sumall_except_dim<0> of batch 0:\n"); + Tensor red_tc = NewTensor(Shape1(2), 0.0f); + Tensor red_tg = NewTensor(Shape1(2), 0.0f); + red_tc = sumall_except_dim<0>(tc[0]); + red_tg = sumall_except_dim<0>(tg[0]); + Print1DTensor(red_tc); + if (Check1DTensor(red_tg, red_tc)) { + printf("cpu & gpu result consists\n"); + } + FreeSpace(&red_tc); + FreeSpace(&red_tg); + // softmax + printf("\n#Softmax\n"); + Tensor sm_tc = NewTensor(tc[0].shape_, 0.0f); + Tensor sm_tg = NewTensor(tg[0].shape_, 0.0f); + Softmax(sm_tc, tc[0]); + Softmax(sm_tg, tg[0]); + FreeSpace(&sm_tc); + FreeSpace(&sm_tg); + if (Check2DTensor(sm_tg, sm_tc)) { + printf("cpu & gpu result consists\n"); + } + FreeSpace(&tc); + FreeSpace(&tg); + ShutdownTensorEngine(); +} diff --git a/test/test.h b/test/test.h new file mode 100644 index 000000000000..1b2a955558e2 --- /dev/null +++ b/test/test.h @@ -0,0 +1,67 @@ +#ifndef TEST_H +#define TEST_H + +#include "mshadow/tensor.h" +#include "assert.h" + +#define EPS 0.0001 +using namespace mshadow; +using namespace mshadow::expr; + + +template +void Print2DTensor(Tensor const &ts); + +template +void Print1DTensor(Tensor const &ts); + +template<> +void Print1DTensor(Tensor const &ts) { + for (index_t i = 0; i < ts.size(0); ++i) { + printf("%.2f ", ts[i]); + } + printf("\n"); +} + + +template<> +void Print2DTensor(Tensor const &ts) { + for (index_t i = 0; i < ts.size(0); ++i) { + Print1DTensor(ts[i]); + } +} + +template<> +void Print2DTensor(Tensor const &tg) { + Tensor tc = NewTensor(tg.shape_, 0.0f); + Copy(tc, tg); + Print2DTensor(tc); + FreeSpace(&tc); +} + + + +bool Check2DTensor(Tensor const &tg, Tensor const &tc) { + Tensor tcc = NewTensor(tg.shape_, 0.0f); + Copy(tcc, tg); + for (index_t i = 0; i < tc.size(0); ++i) { + for (index_t j = 0; j < tc.size(1); ++j) { + assert(abs(tcc[i][j] - tc[i][j]) < EPS); + } + } + FreeSpace(&tcc); + return true; +} + +bool Check1DTensor(Tensor const &tg, Tensor const &tc) { + Tensor tcc = NewTensor(tc.shape_, 0.0f); + Copy(tcc, tg); + // printf("gpu result:\n"); + // Print1DTensor(tcc); + for (index_t i = 0; i < tc.size(0); ++i) { + assert(abs(tcc[i] - tc[i]) < EPS); + } + FreeSpace(&tcc); + return true; +} +#endif From d7d7ed641ca6df327e8329a9738a10e3542f198b Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sun, 28 Dec 2014 20:46:35 -0700 Subject: [PATCH 042/147] chg --- test/test.cu | 27 ++++++++++++++++++++------- test/test.h | 4 ++-- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/test/test.cu b/test/test.cu index aefd93063196..37fe7e76cbd0 100644 --- a/test/test.cu +++ b/test/test.cu @@ -19,6 +19,10 @@ int main() { // print printf("\n#print batch 0 of cpu tensor:\n"); Print2DTensor(tc[0]); + printf("\n"); + Print2DTensor(tc[1]); + printf("\n"); + Print2DTensor(tc[2]); // check if (Check2DTensor(tg[1], tc[1])) { printf("batch 1 of gpu & cpu tensor are same.\n"); @@ -35,12 +39,12 @@ int main() { } FreeSpace(&tmp_tc); FreeSpace(&tmp_tg); - // sumall_except_dim: Waring, random fail! 
+ // sumall_except_dim printf("\n#sumall_except_dim<0> of batch 0:\n"); - Tensor red_tc = NewTensor(Shape1(2), 0.0f); - Tensor red_tg = NewTensor(Shape1(2), 0.0f); - red_tc = sumall_except_dim<0>(tc[0]); - red_tg = sumall_except_dim<0>(tg[0]); + Tensor red_tc = NewTensor(Shape1(tc.size(0)), 0.0f); + Tensor red_tg = NewTensor(Shape1(tg.size(0)), 0.0f); + red_tc = sumall_except_dim<0>(tc); + red_tg = sumall_except_dim<0>(tg); Print1DTensor(red_tc); if (Check1DTensor(red_tg, red_tc)) { printf("cpu & gpu result consists\n"); @@ -53,11 +57,20 @@ int main() { Tensor sm_tg = NewTensor(tg[0].shape_, 0.0f); Softmax(sm_tc, tc[0]); Softmax(sm_tg, tg[0]); - FreeSpace(&sm_tc); - FreeSpace(&sm_tg); if (Check2DTensor(sm_tg, sm_tc)) { printf("cpu & gpu result consists\n"); } + // mirror + printf("\n#mirror\n"); + sm_tc = mirror(tc[0]); + sm_tg = mirror(tg[0]); + if (Check2DTensor(sm_tg, sm_tc)) { + printf("cpu & gpu result consists\n"); + } + FreeSpace(&sm_tc); + FreeSpace(&sm_tg); + // reshape + FreeSpace(&tc); FreeSpace(&tg); ShutdownTensorEngine(); diff --git a/test/test.h b/test/test.h index 1b2a955558e2..2cfc515957ca 100644 --- a/test/test.h +++ b/test/test.h @@ -56,8 +56,8 @@ bool Check2DTensor(Tensor const &tg, Tensor const bool Check1DTensor(Tensor const &tg, Tensor const &tc) { Tensor tcc = NewTensor(tc.shape_, 0.0f); Copy(tcc, tg); - // printf("gpu result:\n"); - // Print1DTensor(tcc); + printf("gpu result:\n"); + Print1DTensor(tcc); for (index_t i = 0; i < tc.size(0); ++i) { assert(abs(tcc[i] - tc[i]) < EPS); } From e2035a5ab7a1c007ec5c7fdee69531666d16393e Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 28 Dec 2014 19:48:24 -0800 Subject: [PATCH 043/147] fix sumall --- example/basic.cpp | 2 ++ mshadow/tensor_cpu-inl.h | 2 +- mshadow/tensor_gpu-inl.h | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/example/basic.cpp b/example/basic.cpp index 79194e39a157..2bae1a42729f 100644 --- a/example/basic.cpp +++ b/example/basic.cpp @@ -17,7 +17,9 @@ int main(void) { // Tensor object is only a handle, assignment means they have same data content Tensor mat2= NewTensor(Shape1(2), 0.0f); Tensor ts1= NewTensor(ts.shape_, 0.0f); + Random rnd(0); mat2[1] = 10; + mat2 = rnd.uniform(mat2.shape_); // shape of matrix, note shape order is different from numpy // shape[i] indicate the shape of i-th dimension printf("%u X %u matrix, stride=%u\n", mat.size(0), mat.size(1), mat.stride_); diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h index 56f48a300a7f..62951f14aa45 100644 --- a/mshadow/tensor_cpu-inl.h +++ b/mshadow/tensor_cpu-inl.h @@ -146,7 +146,7 @@ inline void MapReduceKeepHighDim(TRValue *dst, // use equvalent form Shape<4> pshape = Shape4(eshape.ProdShape(0, dimkeep), eshape[dimkeep], - eshape.ProdShape(dimkeep, EShape::kSubdim), + eshape.ProdShape(dimkeep + 1, EShape::kSubdim), eshape[EShape::kSubdim]); // execution expr::Plan dplan = MakePlan(dst->self()); diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h index f3eade7aee9c..52dee8768012 100644 --- a/mshadow/tensor_gpu-inl.h +++ b/mshadow/tensor_gpu-inl.h @@ -149,7 +149,7 @@ inline void MapReduceKeepHighDim(TRValue *dst, // use equvalent form Shape<4> pshape = Shape4(eshape.ProdShape(0, dimkeep), eshape[dimkeep], - eshape.ProdShape(dimkeep, EShape::kSubdim), + eshape.ProdShape(dimkeep + 1, EShape::kSubdim), eshape[EShape::kSubdim]); // call equavalent map red dim 2 cuda::MapReduceKeepDim1 From 81b73e20aeb050c5b532b40e35abe1ac34c0ed98 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 28 Dec 2014 20:13:09 -0800 
Subject: [PATCH 044/147] fix shape4 --- mshadow/dot_engine-inl.h | 2 +- mshadow/tensor.h | 10 +++++----- mshadow/tensor_gpu-inl.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mshadow/dot_engine-inl.h b/mshadow/dot_engine-inl.h index 2d0f9e240627..38c05da757b0 100644 --- a/mshadow/dot_engine-inl.h +++ b/mshadow/dot_engine-inl.h @@ -115,7 +115,7 @@ struct BLASEngine { cublasDger(m, n, alpha, X, incX, Y, incY, A, lda); } }; -#endif +#endif // MSHADOW_USE_CUDA // helper function to decide which shape we are in inline static Shape<2> GetShape(const Shape<2> &shape, bool transpose) { return transpose ? Shape2(shape[0], shape[1]) : shape; diff --git a/mshadow/tensor.h b/mshadow/tensor.h index 0e87ea245df0..149eae19a1ff 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -179,14 +179,14 @@ MSHADOW_XINLINE Shape<3> Shape3(index_t s0, index_t s1, index_t s2) { } /*! * \brief construct a four dimension shape, stride will equal s0 - * \param s3 size of dimension 3 - * \param s2 size of dimension 2 - * \param s1 size of dimension 1 * \param s0 size of dimension 0 + * \param s1 size of dimension 1 + * \param s2 size of dimension 2 + * \param s3 size of dimension 3 * \return the shape construction */ -MSHADOW_XINLINE Shape<4> Shape4(index_t s3, index_t s2, - index_t s1, index_t s0) { +MSHADOW_XINLINE Shape<4> Shape4(index_t s0, index_t s1, + index_t s2, index_t s3) { Shape<4> s; s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; return s; diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h index 52dee8768012..f6d91d67b83a 100644 --- a/mshadow/tensor_gpu-inl.h +++ b/mshadow/tensor_gpu-inl.h @@ -160,7 +160,7 @@ inline void Softmax(Tensor dst, const Tensor& src) { cuda::Softmax(dst, src); } -} // namespace mshadow #endif // __CUDACC__ #endif // MSHADOW_USE_CUDA +} // namespace mshadow #endif // MSHADOW_TENSOR_GPU_INL_H_ From 9b62f8be7261d0b4994834e35a7720ff40c5fe40 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 28 Dec 2014 20:56:45 -0800 Subject: [PATCH 045/147] fix conv example --- example/neuralnet/convnet.cu | 2 +- mshadow/base.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/example/neuralnet/convnet.cu b/example/neuralnet/convnet.cu index de1bd49e18c5..eb9bd4bc907a 100644 --- a/example/neuralnet/convnet.cu +++ b/example/neuralnet/convnet.cu @@ -47,7 +47,7 @@ class ConvNet : public INNet { nhiddenbak.Resize(nhidden.shape_); npool.Resize(Shape4(batch_size, nchannel, (nhidden.size(2)+1-psize)/psize, (nhidden.size(3)+1-psize)/psize)); npoolbak.Resize(npool.shape_); - nflat.Resize(Shape2(batch_size, npool.size(0)*npool.size(1)*npool.size(2))); + nflat.Resize(Shape2(batch_size, npool.size(1)*npool.size(2)*npool.size(3))); nout.Resize(Shape2(batch_size, num_out)); // setup bias hbias.Resize(Shape1(nchannel)); g_hbias.Resize(hbias.shape_); diff --git a/mshadow/base.h b/mshadow/base.h index 5ccd0415a062..b733354b3899 100644 --- a/mshadow/base.h +++ b/mshadow/base.h @@ -38,8 +38,8 @@ #if MSHADOW_STAND_ALONE #define MSHADOW_USE_CBLAS 0 - #define MSHADOW_USE_MKL 1 - #define MSHADOW_USE_CUDA 1 + #define MSHADOW_USE_MKL 0 + #define MSHADOW_USE_CUDA 0 #endif /*! 
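editor's note: these standalone defaults are flipped more than once in this
 series (this patch turns MKL and CUDA off, PATCH 046 and PATCH 052 then
 change MSHADOW_STAND_ALONE and MSHADOW_USE_MKL again), so a build that
 needs a fixed BLAS/CUDA combination is safer passing the macros on the
 compiler command line, as the example Makefile added in PATCH 061 does
 with -DMSHADOW_STAND_ALONE=1. */
/*!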
\brief use CBLAS for CBLAS */ From 6f4c9d6a7d329227f4d2b019eeed02c59ba72194 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 28 Dec 2014 21:16:33 -0800 Subject: [PATCH 046/147] fix dot --- example/neuralnet/nnet.cu | 2 +- mshadow/base.h | 4 ++-- mshadow/dot_engine-inl.h | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/example/neuralnet/nnet.cu b/example/neuralnet/nnet.cu index 545a6e43428e..b4dea5f1d8ce 100644 --- a/example/neuralnet/nnet.cu +++ b/example/neuralnet/nnet.cu @@ -73,7 +73,7 @@ class NNet : public INNet { Copy(oubatch, nout); } // back propagation - virtual void Backprop(const Tensor& gradout) { + virtual void Backprop(const Tensor& gradout) { // copy gradient to output layer Copy(nout, gradout); // calc grad of layer 2 diff --git a/mshadow/base.h b/mshadow/base.h index b733354b3899..b29e8c57275f 100644 --- a/mshadow/base.h +++ b/mshadow/base.h @@ -18,7 +18,7 @@ * mshadow should compile without any of other libs */ #ifndef MSHADOW_STAND_ALONE -#define MSHADOW_STAND_ALONE 1 +#define MSHADOW_STAND_ALONE 0 #endif /*! \brief whether do padding during allocation */ #ifndef MSHADOW_ALLOC_PAD @@ -48,7 +48,7 @@ #endif /*! \brief use MKL for BLAS */ #ifndef MSHADOW_USE_MKL - #define MSHADOW_USE_MKL 0 + #define MSHADOW_USE_MKL 1 #endif /*! * \brief use CUDA support, must ensure that the cuda include path is correct, diff --git a/mshadow/dot_engine-inl.h b/mshadow/dot_engine-inl.h index 38c05da757b0..4df71ccdbdfd 100644 --- a/mshadow/dot_engine-inl.h +++ b/mshadow/dot_engine-inl.h @@ -118,7 +118,7 @@ struct BLASEngine { #endif // MSHADOW_USE_CUDA // helper function to decide which shape we are in inline static Shape<2> GetShape(const Shape<2> &shape, bool transpose) { - return transpose ? Shape2(shape[0], shape[1]) : shape; + return transpose ? 
Shape2(shape[1], shape[0]) : shape; } // dst = dot(lhs[.T], rhs[.T]) template { Shape<2> sleft = GetShape(lhs.shape_, transpose_left); Shape<2> sright = GetShape(rhs.shape_, transpose_right); utils::Check(dst.size(0) == sleft[0] && dst.size(1) == sright[1] \ - && sleft[1] == sright[0] , "dot-gemm: matrix shape mismatch"); + && sleft[1] == sright[0] , + "dot-gemm: matrix shape mismatch"); // use column major argument to compatible with most BLAS BLASEngine::gemm (transpose_right , transpose_left, From c4fc1c994fcc751af7f739ad72bde840fab5dd1c Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 28 Dec 2014 21:36:48 -0800 Subject: [PATCH 047/147] fix conv --- example/neuralnet/convnet.cu | 4 ++-- mshadow/cuda/tensor_gpu-inl.cuh | 2 +- mshadow/extension/pack_col2patch.h | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/example/neuralnet/convnet.cu b/example/neuralnet/convnet.cu index eb9bd4bc907a..ce2f644abbaf 100644 --- a/example/neuralnet/convnet.cu +++ b/example/neuralnet/convnet.cu @@ -224,8 +224,8 @@ int main(int argc, char *argv[]) { LoadMNIST("train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain_, true); LoadMNIST("t10k-images-idx3-ubyte", "t10k-labels-idx1-ubyte", ytest, xtest_, false); - TensorContainer xtrain(Shape4(xtrain_.size(1), 1, insize, insize)); - TensorContainer xtest(Shape4(xtest_.size(1), 1, insize, insize)); + TensorContainer xtrain(Shape4(xtrain_.size(0), 1, insize, insize)); + TensorContainer xtest(Shape4(xtest_.size(0), 1, insize, insize)); xtrain = reshape(xtrain_, xtrain.shape_); xtest = reshape(xtest_, xtest.shape_); diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index b9e96a46fa89..b69bd2841140 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -13,7 +13,7 @@ namespace mshadow { namespace cuda { /*! 
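editor's note on the GetShape fix in PATCH 046 above: with the axes
 actually swapped for a transposed operand, the dot-gemm shape check now
 verifies, for dst = dot(lhs.T(), rhs), that dst.size(0) == lhs.size(1),
 dst.size(1) == rhs.size(1) and lhs.size(0) == rhs.size(0), which is the
 contract the column-major BLAS call expects. */
/*!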
\brief seems CUDAARCH is deprecated in future NVCC */ #ifndef __CUDA_ARCH__ -//#warning "__CUDA_ARCH__ is not defined, I will assume compiling with CUDA verion greater than 2.0" +#warning "__CUDA_ARCH__ is not defined, I will assume compiling with CUDA verion greater than 2.0" #endif /* load unit for memory access, if CUDAARCH not defined, this is advanced nvcc */ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200 diff --git a/mshadow/extension/pack_col2patch.h b/mshadow/extension/pack_col2patch.h index 73d378e0a257..28001b42c9e6 100644 --- a/mshadow/extension/pack_col2patch.h +++ b/mshadow/extension/pack_col2patch.h @@ -41,7 +41,7 @@ struct PackColToPatchXExp: utils::Check(sshape[1] == o_height * o_width * imshape.ProdShape(0, dstdim - 3), "PackColToPatchExp: src.size(1) mismatch"); - utils::Check(sshape[0] == psize_y * psize_x * imshape[2], + utils::Check(sshape[0] == psize_y * psize_x * imshape[dstdim - 3], "PackColToPatchExp: src.size(0) mismatch"); } }; @@ -65,7 +65,8 @@ pack_col2patch(const expr::Exp &src, index_t psize_x, index_t pstride) { TypeCheckPass::kDim == 2> ::Error_Expression_Does_Not_Meet_Dimension_Req(); - utils::Check(imshape[0] >= psize_x && imshape[1] >= psize_y, + utils::Check(imshape[dstdim - 1] >= psize_x && + imshape[dstdim - 2] >= psize_y, "PackColToPatch:image shape smaller than patch size"); return PackColToPatchXExp(src.self(), imshape, psize_y, psize_x, pstride); From 064ff2c621f17ff75572689ba55febf2384c70c8 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Mon, 29 Dec 2014 16:46:15 -0700 Subject: [PATCH 048/147] pack, pool --- test/pool.cu | 69 ++++++++++++++++++++++++++++++++++++++++ test/unpack.cu | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 test/pool.cu create mode 100644 test/unpack.cu diff --git a/test/pool.cu b/test/pool.cu new file mode 100644 index 000000000000..9641d53c9c45 --- /dev/null +++ b/test/pool.cu @@ -0,0 +1,69 @@ +#include "mshadow/tensor.h" +#include "old/tensor.h" +#include "assert.h" +#include + +using mshadow::index_t; +template +void Print(T const & ist) { + for (int i = 0; i < ist.size(0); ++i) { + for (int j = 0; j < ist.size(1); ++j) { + printf("%.2f ", ist[i][j]); + } + printf("\n"); + } +} + +bool Check(mshadow::TensorContainer &mct, \ + Xmshadow::TensorContainer &xct) { + for (index_t i = 0; i < mct.size(0); ++i) { + for (index_t j = 0; j < mct.size(1); ++j) { + assert(mct[i][j] == xct[i][j]); + } + } + return true; +} + +template +void RunTask() { + const int X = 6; + const int K = 2; + mshadow::TensorContainer srcm(mshadow::Shape2(X, X)); + Xmshadow::TensorContainer srcx(Xmshadow::Shape2(X, X)); + + mshadow::TensorContainer mct(mshadow::Shape2(X, X)); + Xmshadow::TensorContainer xct(Xmshadow::Shape2(X, X)); + for (int i = 0; i < X; ++i) { + for (int j = 0; j < X; ++j) { + srcm[i][j] = i * 0.1f + j * 0.1f; + srcx[i][j] = i * 0.1f + j * 0.1f; + } + } + mshadow::Copy(mct, srcm); + Xmshadow::Copy(xct, srcx); + mshadow::TensorContainer pool_ct(mshadow::Shape2((X-K)/2+1, (X-K)/2+1)); + Xmshadow::TensorContainer pool_xct(Xmshadow::Shape2((X-K)/2+1, (X-K)/2+1)); + + pool_ct = mshadow::expr::pool(mct, K, K, K); + pool_xct = Xmshadow::expr::pool(xct, K, K); + + mshadow::TensorContainer cpool_ct(mshadow::Shape2((X-K)/2+1, (X-K)/2+1)); + Xmshadow::TensorContainer cpool_xct(Xmshadow::Shape2((X-K)/2+1, (X-K)/2+1)); + mshadow::Copy(cpool_ct, pool_ct); + Xmshadow::Copy(cpool_xct, pool_xct); + if (Check(cpool_ct, cpool_xct)) { + printf("Pass\n"); + } +} + +int main(int argc, char** 
argv) { + if (argc < 2) { + printf("Usage: dev\n"); + exit(-1); + } + if (!strcmp(argv[1], "cpu")) { + RunTask(); + } else { + RunTask(); + } +} \ No newline at end of file diff --git a/test/unpack.cu b/test/unpack.cu new file mode 100644 index 000000000000..dd0c2b9c5821 --- /dev/null +++ b/test/unpack.cu @@ -0,0 +1,85 @@ +#include "mshadow/tensor.h" +#include "old/tensor.h" +#include "assert.h" +#include + +using mshadow::index_t; +template +void Print(T const & ist) { + for (int i = 0; i < ist.size(0); ++i) { + for (int j = 0; j < ist.size(1); ++j) { + printf("%.2f ", ist[i][j]); + } + printf("\n"); + } +} + +bool Check(mshadow::TensorContainer &mct, \ + Xmshadow::TensorContainer &xct) { + for (index_t i = 0; i < mct.size(0); ++i) { + for (index_t j = 0; j < mct.size(1); ++j) { + assert(mct[i][j] == xct[i][j]); + } + } + return true; +} + +template +void RunTask() { + const int ksize = 3; + const int kstride = 2; + const int X = 6; + Xmshadow::TensorContainer xsrc(Xmshadow::Shape4(1, 1, X, X)); + mshadow::TensorContainer src(mshadow::Shape4(1, 1, X, X)); + + for (int i = 0; i < X; ++i) { + for (int j = 0; j < X; ++j) { + xsrc[0][0][i][j] = i * 0.1f + j * 0.2f; + src[0][0][i][j] = i * 0.1f + j * 0.2f; + } + } + Xmshadow::TensorContainer xin(Xmshadow::Shape4(1, 1, X, X)); + mshadow::TensorContainer in(mshadow::Shape4(1, 1, X, X)); + + mshadow::Copy(in, src); + Xmshadow::Copy(xin, xsrc); + + Xmshadow::TensorContainer xtmp_col; + mshadow::TensorContainer tmp_col; + + + index_t oheight = (in.size(2) - ksize)/kstride + 1; + index_t owidth = (in.size(3) - ksize)/kstride + 1; + index_t nbatch = in.size(0); + + + xtmp_col.Resize( Xmshadow::Shape2( xin.shape[2]*ksize*ksize, nbatch*oheight*owidth ) ); + tmp_col.Resize(mshadow::Shape2(in.size(1)*ksize*ksize, nbatch*oheight*owidth)); + xtmp_col = Xmshadow::expr::unpack_patch2col( xin, ksize, kstride ); + tmp_col = mshadow::expr::unpack_patch2col(in, ksize, ksize, kstride); + + Xmshadow::TensorContainer xtc; + mshadow::TensorContainer tc; + + xtc.Resize( Xmshadow::Shape2( xin.shape[2]*ksize*ksize, nbatch*oheight*owidth ) ); + tc.Resize(mshadow::Shape2(in.size(1)*ksize*ksize, nbatch*oheight*owidth)); + + mshadow::Copy(tc, tmp_col); + Xmshadow::Copy(xtc, xtmp_col); + if (Check(tc, xtc)) { + printf("Pass\n"); + } + +} + +int main(int argc, char** argv) { + if (argc < 2) { + printf("Usage: dev\n"); + exit(-1); + } + if (!strcmp(argv[1], "cpu")) { + RunTask(); + } else { + RunTask(); + } +} \ No newline at end of file From b4483ec2c07ce5cede536d2f765068d65494132e Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Mon, 29 Dec 2014 16:46:41 -0700 Subject: [PATCH 049/147] reshape --- test/reshape.cu | 74 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 test/reshape.cu diff --git a/test/reshape.cu b/test/reshape.cu new file mode 100644 index 000000000000..c1ad52e07c40 --- /dev/null +++ b/test/reshape.cu @@ -0,0 +1,74 @@ +#include "mshadow/tensor.h" +#include "old/tensor.h" +#include "assert.h" +#include + +using mshadow::index_t; +template +void Print(T const & ist) { + for (int i = 0; i < ist.size(0); ++i) { + for (int j = 0; j < ist.size(1); ++j) { + printf("%.2f ", ist[i][j]); + } + printf("\n"); + } +} + +bool Check(mshadow::TensorContainer &mct, \ + Xmshadow::TensorContainer &xct) { + for (index_t i = 0; i < mct.size(0); ++i) { + for (index_t j = 0; j < mct.size(1); ++j) { + assert(mct[i][j] == xct[i][j]); + } + } + return true; +} + +template +void RunTask() { + const int X = 6; + const int K = 2; + 
mshadow::TensorContainer srcm(mshadow::Shape2(X, X)); + Xmshadow::TensorContainer srcx(Xmshadow::Shape2(X, X)); + + mshadow::TensorContainer mct(mshadow::Shape2(X, X)); + Xmshadow::TensorContainer xct(Xmshadow::Shape2(X, X)); + for (int i = 0; i < X; ++i) { + for (int j = 0; j < X; ++j) { + srcm[i][j] = i * 0.1f + j * 0.1f; + srcx[i][j] = i * 0.1f + j * 0.1f; + } + } + mshadow::Copy(mct, srcm); + Xmshadow::Copy(xct, srcx); + + mshadow::TensorContainer mct4d(mshadow::Shape4(1, 1, X / K, X * K)); + Xmshadow::TensorContainer xct4d(Xmshadow::Shape4(X / K, X * K, 1, 1)); + + mct4d = mshadow::expr::reshape(mct, mct4d.shape_); + xct4d = Xmshadow::expr::reshape(xct, xct4d.shape); + + mct = mshadow::expr::reshape(mct4d, mct.shape_); + xct = Xmshadow::expr::reshape(xct4d, xct.shape); + + mshadow::TensorContainer m_ct(mshadow::Shape2(X, X)); + Xmshadow::TensorContainer x_ct(Xmshadow::Shape2(X, X)); + + mshadow::Copy(m_ct, mct); + Xmshadow::Copy(x_ct, xct); + if (Check(m_ct, x_ct)) { + printf("Pass\n"); + } +} + +int main(int argc, char** argv) { + if (argc < 2) { + printf("Usage: dev\n"); + exit(-1); + } + if (!strcmp(argv[1], "cpu")) { + RunTask(); + } else { + RunTask(); + } +} \ No newline at end of file From 8885e5000ee93c3410c2c44f4cc5f48a8c47f37d Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Mon, 29 Dec 2014 21:18:24 -0700 Subject: [PATCH 050/147] unpool test --- test/pairtest.cu | 105 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 test/pairtest.cu diff --git a/test/pairtest.cu b/test/pairtest.cu new file mode 100644 index 000000000000..56b0380747a7 --- /dev/null +++ b/test/pairtest.cu @@ -0,0 +1,105 @@ +#include "mshadow/tensor.h" +#include "old/tensor.h" +#include "assert.h" +#include + +using mshadow::index_t; +template +void Print(T const & ist, int I, int J) { + for (int i = 0; i < I; ++i) { + for (int j = 0; j < J; ++j) { + printf("%.2f ", ist[i][j]); + } + printf("\n"); + } +} + +bool Check(mshadow::TensorContainer &mct, \ + Xmshadow::TensorContainer &xct) { + for (index_t i = 0; i < mct.size(0); ++i) { + for (index_t j = 0; j < mct.size(1); ++j) { + assert(mct[i][j] == xct[i][j]); + } + } + return true; +} + +template +void RunTask() { + const int X = 6; + const int K = 2; + const int O = (X - K) / 2 + 1; + mshadow::TensorContainer srcm(mshadow::Shape4(1,1,X, X)); + Xmshadow::TensorContainer srcx(Xmshadow::Shape4(1,1,X, X)); + for (int i = 0; i < X; ++i) { + for (int j = 0; j < X; ++j) { + srcm[0][0][i][j] = i * 0.1f + j * 0.1f; + srcx[0][0][i][j] = i * 0.1f + j * 0.1f; + } + } + printf("Source:\n"); + Print(srcm[0][0], X, X); + printf("\n"); + mshadow::TensorContainer mct(mshadow::Shape4(1,1,X, X)); + Xmshadow::TensorContainer xct(Xmshadow::Shape4(1,1,X, X)); + mshadow::Copy(mct, srcm); + Xmshadow::Copy(xct, srcx); + + + mshadow::TensorContainer pool_ct(mshadow::Shape4(1,1, O, O)); + Xmshadow::TensorContainer pool_xct(Xmshadow::Shape4(1,1,O,O)); + + pool_ct = mshadow::expr::pool(mct, K, K, K); + pool_xct = Xmshadow::expr::pool(xct, K, K); + + printf("New pool:\n"); + Print(pool_ct[0][0], O, O); + printf("\nOld pool:\n"); + Print(pool_xct[0][0], O, O); + printf("\n"); + mshadow::TensorContainer gpool_src(mshadow::Shape4(1,1, O, O)); + Xmshadow::TensorContainer gpool_xsrc(Xmshadow::Shape4(1,1,O,O)); + for (int i = 0; i < O; ++i) { + for (int j = 0; j < O; ++j) { + gpool_src[0][0][i][j] = 0.1f; + gpool_xsrc[0][0][i][j] = 0.1f; + } + } + mshadow::TensorContainer gpool_ct(mshadow::Shape4(1,1, O, O)); + Xmshadow::TensorContainer 
gpool_xct(Xmshadow::Shape4(1,1,O,O)); + mshadow::Copy(gpool_ct, gpool_src); + Xmshadow::Copy(gpool_xct, gpool_xsrc); + + mshadow::TensorContainer mout(mshadow::Shape4(1,1,X, X)); + Xmshadow::TensorContainer xout(Xmshadow::Shape4(1,1,X, X)); + + mout = mshadow::expr::unpool(mct, pool_ct, gpool_ct, K, K, K); + xout = Xmshadow::expr::unpool(xct, pool_xct, gpool_xct, K, K); + + mshadow::Copy(srcm, mout); + Xmshadow::Copy(srcx, xout); + + mshadow::TensorContainer l1(mshadow::Shape2(X,X)); + Xmshadow::TensorContainer l2(Xmshadow::Shape2(X, X)); + l1 = mshadow::expr::reshape(srcm, l1.shape_); + l2 = Xmshadow::expr::reshape(srcx, l2.shape); + printf("New unpool\n"); + Print(l1, l1.size(0), l1.size(1)); + printf("\nOld unpool\n"); + Print(l2, X, X); + if (Check(l1, l2)) { + printf("Pass\n"); + } +} + +int main(int argc, char** argv) { + if (argc < 1) { + printf("Usage: dev\n"); + exit(-1); + } + if (!strcmp(argv[1], "cpu")) { + RunTask(); + } else { + RunTask(); + } +} \ No newline at end of file From 7e4a130935c46c9aea1423356839c33690dadb98 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Mon, 29 Dec 2014 23:38:57 -0700 Subject: [PATCH 051/147] fix unpooling --- mshadow/extension/spatial_unpool.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mshadow/extension/spatial_unpool.h b/mshadow/extension/spatial_unpool.h index 441e04b3b815..848b77bb39fa 100644 --- a/mshadow/extension/spatial_unpool.h +++ b/mshadow/extension/spatial_unpool.h @@ -111,10 +111,11 @@ struct Plan, DType> { for (index_t py = py_min; py < py_max; ++py) { for (index_t px = px_min; px < px_max; ++px) { val += Reducer::PartialGrad(vsrc, - data_pooled_.Eval(c * pshape_y_ + py, px) * - grad_pooled_.Eval(c * pshape_y_ + py, px)); + data_pooled_.Eval(c * pshape_y_ + py, px)) * + grad_pooled_.Eval(c * pshape_y_ + py, px); } } + return val; } From 7e7de843605842c3ee5a2b5f9854deb399284b31 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 2 Jan 2015 20:46:57 -0800 Subject: [PATCH 052/147] convert up to conv layer --- mshadow/base.h | 6 +++--- mshadow/io.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mshadow/base.h b/mshadow/base.h index b733354b3899..733c532a403d 100644 --- a/mshadow/base.h +++ b/mshadow/base.h @@ -18,7 +18,7 @@ * mshadow should compile without any of other libs */ #ifndef MSHADOW_STAND_ALONE -#define MSHADOW_STAND_ALONE 1 +#define MSHADOW_STAND_ALONE 0 #endif /*! \brief whether do padding during allocation */ #ifndef MSHADOW_ALLOC_PAD @@ -121,8 +121,8 @@ extern "C" { * template arguments can be detected */ #ifndef MSHADOW_DEFAULT_DTYPE -//#define MSHADOW_DEFAULT_DTYPE = default_real_t -#define MSHADOW_DEFAULT_DTYPE +#define MSHADOW_DEFAULT_DTYPE = default_real_t +//#define MSHADOW_DEFAULT_DTYPE #endif /*! 
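editor's note: re-enabling the "= default_real_t" expansion of
 MSHADOW_DEFAULT_DTYPE presumably restores DType as a defaulted template
 parameter, so tensor types can again be written without spelling out the
 element type. */
/*!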
\brief namespace for mshadow */ diff --git a/mshadow/io.h b/mshadow/io.h index 8034609bc1dd..04e1681766ac 100644 --- a/mshadow/io.h +++ b/mshadow/io.h @@ -69,7 +69,7 @@ inline void LoadBinary(TStream &fi, // implementations template inline void SaveBinary(TStream &fo, const Tensor &src_) { - fo.Write(src_.shape_, sizeof(src_.shape_)); + fo.Write(&src_.shape_, sizeof(src_.shape_)); Tensor src = src_.FlatTo2D(); for (index_t i = 0; i < src.size(0); ++i) { fo.Write(src[i].dptr_, sizeof(DType) * src.size(1)); @@ -96,7 +96,7 @@ inline void LoadBinary(TStream &fi, dst_->shape_ = shape; AllocSpace(dst_); } Tensor dst = dst_->FlatTo2D(); - if (dst.shape[1] == 0) return; + if (dst.size(0) == 0) return; for (index_t i = 0; i < dst.size(0); ++i) { utils::Check(fi.Read(dst[i].dptr_, sizeof(DType) * dst.size(1)) != 0, "mshadow::LoadBinary"); From 25c4573e617175d21e30fa4c43932d160f0bd846 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 2 Jan 2015 21:22:18 -0800 Subject: [PATCH 053/147] finish refactor layers --- mshadow/expr_engine-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mshadow/expr_engine-inl.h b/mshadow/expr_engine-inl.h index 5b7f4f8d7d13..26d99e948149 100644 --- a/mshadow/expr_engine-inl.h +++ b/mshadow/expr_engine-inl.h @@ -134,7 +134,7 @@ class Plan, DType> { template struct Plan, DType> { public: - explicit Plan(const Plan &src) : src_(src) {} + Plan(const Plan &src) : src_(src) {} MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { return src_.Eval(y, x); } From 0c274487d3ad8569f8b93a938de105b5971d82bd Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 2 Jan 2015 21:42:58 -0800 Subject: [PATCH 054/147] change --- mshadow/tensor.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/mshadow/tensor.h b/mshadow/tensor.h index 149eae19a1ff..73f4add5aac7 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -262,7 +262,7 @@ struct Tensor: public TRValue, * \tparam startdim the starting dimension */ template - MSHADOW_XINLINE size_t MSize(void) const { + MSHADOW_XINLINE size_t MemSize(void) const { size_t memsz = this->stride_; #pragma unroll for (int i = startdim; i < kSubdim; ++i) { @@ -270,6 +270,12 @@ struct Tensor: public TRValue, } return memsz; } + /*! + * \return memory cost of the tensor, including the aligned x dimension + */ + MSHADOW_XINLINE size_t MSize(void) const { + return this->MemSize<0>(); + } /*! * \brief return size of i-th dimension, start counting from highest dimension * \param the dimension count from the highest dimensin @@ -291,7 +297,7 @@ struct Tensor: public TRValue, * \return the result tensor */ MSHADOW_XINLINE Tensor operator[](index_t idx) const { - return Tensor(dptr_ + this->MSize<1>() * idx, + return Tensor(dptr_ + this->MemSize<1>() * idx, shape_.SubShape(), stride_); } /*! 
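editor's note: the MemSize/MSize split above separates two ideas.
 MemSize<startdim>() is the padded element count from dimension startdim
 downward, while MSize() == MemSize<0>() covers the whole tensor. For an
 illustrative 2-D tensor of shape (4, 5) whose stride_ was padded to 8,
 MemSize<1>() == stride_ == 8 and MSize() == 4 * 8 == 32, which is why
 operator[] above and Slice in the next hunk advance dptr_ by
 MemSize<1>(). */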
@@ -304,7 +310,7 @@ struct Tensor: public TRValue, Slice(index_t begin, index_t end) const { Shape s = this->shape_; s[0] = end - begin; - return Tensor(dptr_ + this->MSize<1>() * begin, + return Tensor(dptr_ + this->MemSize<1>() * begin, s, stride_); } /*!\brief functions to fit expression template */ @@ -355,6 +361,9 @@ struct Tensor: s[0] = end - begin; return Tensor(dptr_ + begin, s); } + MSHADOW_XINLINE size_t MSize(void) const { + return shape_[0]; + } MSHADOW_XINLINE index_t size(index_t i) const { return shape_[0]; } From 193e19f9948eec0abebe57f544de61b8d7125b98 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 3 Jan 2015 17:03:06 -0800 Subject: [PATCH 055/147] fix macro --- mshadow/tensor_gpu-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h index f6d91d67b83a..52dee8768012 100644 --- a/mshadow/tensor_gpu-inl.h +++ b/mshadow/tensor_gpu-inl.h @@ -160,7 +160,7 @@ inline void Softmax(Tensor dst, const Tensor& src) { cuda::Softmax(dst, src); } +} // namespace mshadow #endif // __CUDACC__ #endif // MSHADOW_USE_CUDA -} // namespace mshadow #endif // MSHADOW_TENSOR_GPU_INL_H_ From cbe8f7aca32355999d2f44248f958c4bf72a28b1 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 3 Jan 2015 17:08:35 -0800 Subject: [PATCH 056/147] fi no cuda --- mshadow/tensor_gpu-inl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h index 52dee8768012..7cff5c22e8ea 100644 --- a/mshadow/tensor_gpu-inl.h +++ b/mshadow/tensor_gpu-inl.h @@ -97,7 +97,9 @@ inline void Copy(Tensor dst, const Tensor &src) { Copy(dst, src, cudaMemcpyHostToDevice); } +#endif // MSHADOW_USE_CUDA } // namespace mshadow + // the following part is included only if compiler is nvcc #ifdef __CUDACC__ #include "./cuda/tensor_gpu-inl.cuh" @@ -162,5 +164,4 @@ inline void Softmax(Tensor dst, } } // namespace mshadow #endif // __CUDACC__ -#endif // MSHADOW_USE_CUDA #endif // MSHADOW_TENSOR_GPU_INL_H_ From bd70c42cd8a2fa141260f2c600c3520ea7e5889f Mon Sep 17 00:00:00 2001 From: winsty Date: Sun, 4 Jan 2015 13:45:13 +0800 Subject: [PATCH 057/147] add stride and pad to channel pooling --- mshadow/extension/channel_pool.h | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/mshadow/extension/channel_pool.h b/mshadow/extension/channel_pool.h index 0b4b3cbf8a2b..973dbb49ab82 100644 --- a/mshadow/extension/channel_pool.h +++ b/mshadow/extension/channel_pool.h @@ -26,11 +26,13 @@ struct ChannelPoolingExp: const SrcExp &src_; /*! \brief neighbor size */ index_t nsize_; + /*! \brief stride of pooling */ + index_t stride_; + /*! \brief pad of pooling of each side */ + index_t pad_; /*! 
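editor's note, a caveat on the windowing math this patch adds: Plan::Eval
 below computes (c * stride_ - pad_ < 0) on index_t operands; if index_t is
 an unsigned type, as it appears to be in mshadow, the subtraction wraps
 instead of going negative, the guard never fires, and a padded window near
 channel 0 comes out empty. A signed intermediate such as
 (int)(c * stride_) - (int)pad_ would likely be needed. */
/*!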
\brief constructor */ - ChannelPoolingExp(const SrcExp &src, index_t nsize) - : src_(src), nsize_(nsize) { - utils::Check(nsize % 2 == 1, - "chpool: local size must be odd"); + ChannelPoolingExp(const SrcExp &src, index_t nsize, index_t stride, index_t pad) + : src_(src), nsize_(nsize), stride_(stride), pad_(pad) { this->shape_ = ShapeCheck::Check(src_); utils::Check(this->shape_[srcdim - 3] >= nsize_, "chpool: local size must be smaller than nchannels"); @@ -52,9 +54,21 @@ inline ChannelPoolingExp::kDim> chpool(const Exp &src, index_t nsize) { TypeCheckPass::kDim >= 3> ::Error_Expression_Does_Not_Meet_Dimension_Req(); + utils::Check(nsize % 2 == 1, + "chpool: if no pad is specified, local size must be odd"); return ChannelPoolingExp::kDim>(src.self(), nsize); + DType, ExpInfo::kDim>(src.self(), nsize, 1, nsize / 2); } + +template +inline ChannelPoolingExp::kDim> +chpool(const Exp &src, index_t nsize, index_t stride, index_t pad) { + TypeCheckPass::kDim >= 3> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return ChannelPoolingExp::kDim>(src.self(), nsize, stride, pad); +} + //---------------------- // Execution plan //---------------------- @@ -64,7 +78,7 @@ struct Plan, DType> { explicit Plan(const ChannelPoolingExp &e) : src_(MakePlan(e.src_)), channel_(e.shape_[srcdim - 3]), height_(e.shape_[srcdim - 2]), width_(e.shape_[srcdim - 1]), - hnsize_(e.nsize_ / 2) {} + hnsize_(e.nsize_), stride_(e.stride_), pad_(e.pad_){} MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { using namespace std; const index_t y = i % height_; @@ -72,8 +86,8 @@ struct Plan, DType> { const index_t c = i % channel_; const index_t n = i / channel_; const index_t x = j; - const index_t cstart = c < hnsize_ ? 0 : c - hnsize_; - const index_t cend = min(c + hnsize_ + 1, channel_); + const index_t cstart = c * stride_ - pad_ < 0 ? 
0 : c * stride_ - pad_; + const index_t cend = min(cstart + hnsize_, channel_); DType res; Reducer::SetInitValue(res); for (index_t cc = cstart; cc < cend; ++cc) { Reducer::Reduce(res, src_.Eval((n * channel_ + cc) * height_ + y, x)); @@ -82,7 +96,7 @@ struct Plan, DType> { } private: Plan src_; - const index_t channel_, height_, width_, hnsize_; + const index_t channel_, height_, width_, hnsize_, stride_, pad_; }; } // namespace expr } // namespace mshadow From 7f954a4f39338924afaeeaeff613e46caf8b1124 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 4 Jan 2015 05:32:56 -0800 Subject: [PATCH 058/147] add stream basic, change mapexp to RValue compatible --- example/basic.cpp | 4 ++ mshadow/cuda/tensor_gpu-inl.cuh | 8 +-- mshadow/expr_engine-inl.h | 21 ++++---- mshadow/expression.h | 35 ++++++------ mshadow/extension/reduceto1d.h | 6 ++- mshadow/tensor.h | 47 ++++++++++++++--- mshadow/tensor_cpu-inl.h | 23 ++++++-- mshadow/tensor_gpu-inl.h | 94 +++++++++++++++++++++++++++------ 8 files changed, 175 insertions(+), 63 deletions(-) diff --git a/example/basic.cpp b/example/basic.cpp index 2bae1a42729f..6cf6e5f4c942 100644 --- a/example/basic.cpp +++ b/example/basic.cpp @@ -6,6 +6,7 @@ using namespace mshadow; using namespace mshadow::expr; int main(void) { + // intialize tensor engine before using tensor operation, needed for CuBLAS //InitTensorEngine(); // assume we have a float space @@ -18,6 +19,9 @@ int main(void) { Tensor mat2= NewTensor(Shape1(2), 0.0f); Tensor ts1= NewTensor(ts.shape_, 0.0f); Random rnd(0); + ts.stream_ = NewStream(); + DeleteStream(ts.stream_); + mat2[1] = 10; mat2 = rnd.uniform(mat2.shape_); // shape of matrix, note shape order is different from numpy diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index b69bd2841140..eac83a10d87c 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -36,8 +36,8 @@ const int kMaxGridNum = 65535; /*! \brief suggested grid number for mapping kernel */ const int kBaseGridNum = 1024; /*! \brief get align stride for given size in x dimension */ -inline index_t GetAlignStride(index_t xsize, index_t xstride) { - if ((xstride & (kMemUnit - 1)) == 0) { +inline index_t GetAlignStride(index_t xsize) { + if (xsize >= MSHADOW_MIN_PAD_RATIO * 32) { return ((xsize + kMemUnit - 1) >> kMemUnitBits) << kMemUnitBits; } else { // if originally space is not aligned, no necessary to to alligned thread allocation @@ -82,8 +82,8 @@ __global__ void MapPlanLargeKernel(DstPlan dst, index_t xstride, template inline void MapPlan(expr::Plan dst, const expr::Plan &plan, - Shape<2> dshape, index_t dstride) { - const index_t xstride = GetAlignStride(dshape[1], dstride); + Shape<2> dshape) { + const index_t xstride = GetAlignStride(dshape[1]); const int num_block = (dshape[0] * xstride + kBaseThreadNum-1) / kBaseThreadNum; dim3 dimBlock(kBaseThreadNum, 1, 1); diff --git a/mshadow/expr_engine-inl.h b/mshadow/expr_engine-inl.h index 26d99e948149..73269c3db398 100644 --- a/mshadow/expr_engine-inl.h +++ b/mshadow/expr_engine-inl.h @@ -360,37 +360,38 @@ struct ShapeCheck > { namespace mshadow { namespace expr { /*! \brief some engine that evaluate complex expression */ -template +template struct ExpComplexEngine { - inline static void Eval(Tensor *dst, const E &exp); + inline static void Eval(RV *dst, const E &exp); }; /*! 
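editor's note: after this change the complex-expression hook is keyed on
 the destination rvalue type RV rather than a concrete Tensor. A sketch of
 what a specialization looks like (MyExp is a hypothetical expression type;
 the real instances live in reduceto1d.h and dot_engine-inl.h):

   template<typename SV, typename Device, typename DType, typename Src>
   struct ExpComplexEngine<SV, Tensor<Device, 1, DType>,
                           MyExp<Src>, DType> {
     inline static void Eval(Tensor<Device, 1, DType> *dst,
                             const MyExp<Src> &exp);
   };
 */
/*!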
\brief the engine that dispatches simple operations*/ -template -struct ExpEngine > { +template +struct ExpEngine { template - inline static void Eval(Tensor *dst, + inline static void Eval(RV *dst, const Exp &exp) { MapExp(dst, exp); } template - inline static void Eval(Tensor *dst, + inline static void Eval(RV *dst, const Exp &exp) { MapExp(dst, exp); } template - inline static void Eval(Tensor *dst, + inline static void Eval(RV *dst, const Exp &exp) { MapExp(dst, exp); } template - inline static void Eval(Tensor *dst, + inline static void Eval(RV *dst, const Exp &exp) { - ExpComplexEngine::Eval(dst, exp.self()); + ExpComplexEngine::Eval(dst->ptrself(), exp.self()); } }; template -struct ExpComplexEngine, DotExp, Tensor, ltrans, rtrans, DType>, diff --git a/mshadow/expression.h b/mshadow/expression.h index b5c26d5e65df..428f22fdc0d2 100644 --- a/mshadow/expression.h +++ b/mshadow/expression.h @@ -46,12 +46,11 @@ const int kComplex = 7; * \tparam RValue the type of RValue to be saved * \sa namespace sv */ -template -struct ExpEngine { - /*! \brief defines how expression exp can be evaluated and stored into dst */ - template - inline static void Eval(RValue *dst, const EType &exp); -}; +template +struct ExpEngine; +/*! \brief defines how expression exp can be evaluated and stored into dst */ +//template +//inline static void Eval(RValue *dst, const EType &exp); /*! * \brief base class for expression * \tparam SubType inheritated class must put their type into this parameter @@ -139,66 +138,66 @@ class RValueExp: public Exp { } /*! \brief operator overload */ inline Container &operator+=(DType s) { - ExpEngine::Eval(this->ptrself(), scalar(s)); + ExpEngine::Eval(this->ptrself(), scalar(s)); return *(this->ptrself()); } inline Container &operator-=(DType s) { - ExpEngine::Eval(this->ptrself(), scalar(s)); + ExpEngine::Eval(this->ptrself(), scalar(s)); return *(this->ptrself()); } inline Container &operator*=(DType s) { - ExpEngine::Eval(this->ptrself(), scalar(s)); + ExpEngine::Eval(this->ptrself(), scalar(s)); return *(this->ptrself()); } inline Container &operator/=(DType s) { - ExpEngine::Eval(this->ptrself(), scalar(s)); + ExpEngine::Eval(this->ptrself(), scalar(s)); return *(this->ptrself()); } /*! \brief operator overload */ inline Container &__assign(DType s) { - ExpEngine::Eval(this->ptrself(), scalar(s)); + ExpEngine::Eval(this->ptrself(), scalar(s)); return *(this->ptrself()); } /*! \brief we can not define container = container */ template inline Container &__assign(const Exp &exp) { - ExpEngine::Eval(this->ptrself(), exp.self()); + ExpEngine::Eval(this->ptrself(), exp.self()); return *(this->ptrself()); } /*! \brief we can not define conatiner = container */ template inline Container &__assign(const Exp &exp) { - ExpEngine::Eval(this->ptrself(), exp.self()); + ExpEngine::Eval(this->ptrself(), exp.self()); return *(this->ptrself()); } /*! \brief we can not define container = container */ template inline Container &__assign(const Exp &exp) { - ExpEngine::Eval(this->ptrself(), exp.self()); + ExpEngine::Eval(this->ptrself(), exp.self()); return *(this->ptrself()); } /*! \brief implementation of operator+= */ template inline Container &operator+=(const Exp &exp) { - ExpEngine::Eval(this->ptrself(), exp.self()); + ExpEngine::Eval(this->ptrself(), exp.self()); return *(this->ptrself()); } /*! 
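editor's note: every compound-assignment operator in this class now
 dispatches through a three-parameter ExpEngine<SV, Container, DType>
 rather than the old two-parameter form; threading DType through appears
 to be what lets the kComplex dispatch pick the right ExpComplexEngine
 specialization for non-default element types. */
/*!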
\brief implementation of operator-= */ template inline Container &operator-=(const Exp &exp) { - ExpEngine::Eval(this->ptrself(), exp.self()); + ExpEngine::Eval(this->ptrself(), exp.self()); return *(this->ptrself()); } /*! \brief implementation of operator*= */ template inline Container &operator*=(const Exp &exp) { - ExpEngine::Eval(this->ptrself(), exp.self()); + ExpEngine::Eval(this->ptrself(), exp.self()); return *(this->ptrself()); } /*! \brief implementation of operator/= */ template inline Container &operator/=(const Exp &exp) { - ExpEngine::Eval(this->ptrself(), exp.self()); + ExpEngine::Eval(this->ptrself(), exp.self()); return *(this->ptrself()); } }; diff --git a/mshadow/extension/reduceto1d.h b/mshadow/extension/reduceto1d.h index cfab77cb6659..b35e88c3153f 100644 --- a/mshadow/extension/reduceto1d.h +++ b/mshadow/extension/reduceto1d.h @@ -61,7 +61,8 @@ sum_rows(const Exp &exp) { } template -struct ExpComplexEngine, ReduceTo1DExp, DType> { static const int dimkeep = ExpInfo::kDim - m_dimkeep; @@ -75,7 +76,8 @@ struct ExpComplexEngine -struct ExpComplexEngine, ReduceTo1DExp, DType> { inline static void Eval(Tensor *dst, const ReduceTo1DExp &exp) { diff --git a/mshadow/tensor.h b/mshadow/tensor.h index 73f4add5aac7..78549ed1ad5e 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -20,14 +20,14 @@ struct cpu { /*! \brief whether this device is CPU or not */ static const bool kDevCPU = true; /*! \brief device flag number, identifies this device */ - static const int kDevMask = 1<<0; + static const int kDevMask = 1 << 0; }; /*! \brief device name CPU */ struct gpu { /*! \brief whether this device is CPU or not */ static const bool kDevCPU = false; /*! \brief device flag number, identifies this device */ - static const int kDevMask = 1<<1; + static const int kDevMask = 1 << 1; }; /*! * \brief shape of a tensor @@ -196,6 +196,20 @@ MSHADOW_XINLINE Shape<4> Shape4(index_t s0, index_t s1, */ template struct Stream { + // this is only a dummy implementation for CPU + // for GPU, the actual implementation will be specialized in tensor_gpu-inl.h + /*! + * \brief wait for all the computation associated + * with this stream to complete + */ + inline void Wait(void) {} + /*! + * \brief query whether the the stream is idle + * \return true if the stream is idle and all the job have been completed + */ + inline bool CheckIdle(void) { + return true; + } }; /*! * \brief Tensor RValue, this is the super type of all kinds of possible tensors @@ -406,8 +420,20 @@ inline void InitTensorEngine(int device_id = 0); * \brief Shutdown tensor engine, * this function should be called after all GPU tensor operations, * for using tensors in CPU, this call is actually not needed - */inline void ShutdownTensorEngine(void); - + */ +inline void ShutdownTensorEngine(void); +/*! + * \brief create a new stream from system + * \return a pointer to the created stream + */ +template +inline Stream *NewStream(void); +/*! + * \brief delete the computing stream + * \param stream the stream parameter to be deleted + */ +template +inline void DeleteStream(Stream *stream); /*! 
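* editor's note: a minimal usage sketch of the stream API declared above;
 * device choice and shapes are illustrative, and on cpu both Wait() and
 * CheckIdle() are trivial:
 *
 *   Stream<gpu> *s = NewStream<gpu>();
 *   Tensor<cpu, 2, float> src = NewTensor<cpu>(Shape2(2, 3), 1.0f);
 *   Tensor<gpu, 2, float> dst = NewTensor<gpu>(Shape2(2, 3), 0.0f);
 *   Copy(dst, src, s);     // copy queued asynchronously on s
 *   s->Wait();             // block until the copy has completed
 *   DeleteStream<gpu>(s);
 *   FreeSpace(&src); FreeSpace(&dst);
 */
/*!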
* \brief CPU/CPU: allocate space for CTensor, according to the shape in the obj * this function is responsible to set the stride_ in each obj.shape @@ -455,24 +481,29 @@ inline Tensor NewTensor(const Shape &shape, * \brief copy data from one tensor to another, with same shape * \param dst target tensor * \param src source tensor + * \param stream the stream, when specified, the copy can exhibit asynchronize behavior * \tparam dim specify the dim of tensor * \tparam DType type of element in tensor */ template inline void Copy(Tensor dst, - const Tensor &src); + const Tensor &src, + Stream *stream = NULL); /*! \brief refer to comment of cpu ver \sa Copy */ template inline void Copy(Tensor dst, - const Tensor &src); + const Tensor &src, + Stream *stream = NULL); /*! \brief refer to comment of cpu ver \sa Copy */ template inline void Copy(Tensor dst, - const Tensor &src); + const Tensor &src, + Stream *stream = NULL); /*! \brief refer to comment of cpu ver \sa Copy */ template inline void Copy(Tensor dst, - const Tensor &src); + const Tensor &src, + Stream *stream = NULL); /*! * \brief CPU/GPU: normalize softmax: dst[i][j] = exp(energy[i][j]) /(sum_j exp(energy[i][j])) * \param dst destination diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h index 62951f14aa45..3bc46e1a0d71 100644 --- a/mshadow/tensor_cpu-inl.h +++ b/mshadow/tensor_cpu-inl.h @@ -12,6 +12,15 @@ #include "./sse-inl.h" namespace mshadow { +template<> +inline Stream *NewStream(void) { + return new Stream(); +} +template<> +inline void DeleteStream(Stream *stream) { + delete stream; +} + template inline void AllocSpace(Tensor *obj, bool pad) { size_t pitch; @@ -42,7 +51,8 @@ inline void FreeSpace(Tensor *obj) { } template inline void Copy(Tensor _dst, - const Tensor &_src) { + const Tensor &_src, + Stream *stream) { utils::Check(_dst.shape_ == _src.shape_, "Copy:shape mismatch"); Tensor dst = _dst.FlatTo2D(); Tensor src = _src.FlatTo2D(); @@ -54,7 +64,7 @@ template inline void MapPlan(TRValue *dst, const expr::Plan &plan) { - Shape<2> shape = dst->self().shape_.FlatTo2D(); + Shape<2> shape = expr::ShapeCheck::Check(dst->self()).FlatTo2D(); expr::Plan dplan = expr::MakePlan(dst->self()); for (index_t y = 0; y < shape[0]; ++y) { for (index_t x = 0; x < shape[1]; ++x) { @@ -97,7 +107,8 @@ inline void MapExp(TRValue *dst, expr::TypeCheckPass::kMapPass> ::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); Shape eshape = expr::ShapeCheck::Check(exp.self()); - utils::Check(eshape[0] == 0 || eshape == dst->self().shape_, + Shape dshape = expr::ShapeCheck::Check(dst->self()); + utils::Check(eshape[0] == 0 || eshape == dshape, "Assignment: Shape of Tensors are not consistent with target"); #if MSHADOW_USE_SSE MapExpCPUEngine::kPass, Saver, R, dim, DType, E, etype> @@ -116,7 +127,8 @@ inline void MapReduceKeepLowest(TRValue *dst, ::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); Shape<2> eshape = expr::ShapeCheck::kDim, E> ::Check(exp.self()).FlatTo2D(); - utils::Check(eshape[1] == dst->self().size(0), + Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self()); + utils::Check(eshape[1] == dshape[0], "MapReduceKeepLowest::reduction dimension do not match"); utils::Check(eshape[0] != 0, "can not reduce over empty tensor"); // execution @@ -141,7 +153,8 @@ inline void MapReduceKeepHighDim(TRValue *dst, typedef Shape::kDim> EShape; EShape eshape = expr::ShapeCheck::kDim, E> ::Check(exp.self()); - utils::Check(eshape[dimkeep] == dst->self().size(0), + Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self()); + 
utils::Check(eshape[dimkeep] == dshape[0], "MapReduceKeepHighDim::reduction dimension do not match"); // use equvalent form Shape<4> pshape = Shape4(eshape.ProdShape(0, dimkeep), diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h index 7cff5c22e8ea..7c37331c72a3 100644 --- a/mshadow/tensor_gpu-inl.h +++ b/mshadow/tensor_gpu-inl.h @@ -17,6 +17,55 @@ inline void InitTensorEngine(int dev_id) { inline void ShutdownTensorEngine(void) { } #else +// Stream alocation +// actual implementation of GPU stream in CUDA +template<> +struct Stream { + /*! \brief cudaStream */ + cudaStream_t stream_; + /*! + * \brief wait for all the computation associated + * with this stream to complete + */ + inline void Wait(void) { + cudaError_t err = cudaStreamSynchronize(stream_); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + } + /*! + * \brief query whether the the stream is idle + * \return true if the stream is idle and all the job have been completed + */ + inline bool CheckIdle(void) { + cudaError_t err = cudaStreamQuery(stream_); + if (err == cudaSuccess) return true; + if (err == cudaErrorNotReady) return false; + utils::Error(cudaGetErrorString(err)); + return false; + } + /*! + * \brief returns actual cudaStream_t given an input GPU stream pointer + * \param stream pointer to GPU stream + */ + inline static cudaStream_t GetStream(Stream *stream) { + if (stream == NULL) return 0; + else return stream.stream_; + } +}; + +template<> +inline Stream *NewStream(void) { + Stream st = new Stream(); + cudaError_t err = cudaStreamCreate(st->stream_); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + return st; +} +template<> +inline void DeleteStream(Stream *stream) { + cudaError_t err = cudaStreamDestroy(stream->stream_); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + delete stream; +} + #if (MSHADOW_USE_NVML) inline int AutoSelectDevice(int device_count) { // TODO(bing): nvml device id and cuda device id are not consistent @@ -46,6 +95,7 @@ inline void InitTensorEngine(int dev_id) { inline void ShutdownTensorEngine(void) { cublasShutdown(); } + template inline void AllocSpace(Tensor *obj, bool pad) { size_t pitch; @@ -72,30 +122,40 @@ inline void FreeSpace(Tensor *obj) { template inline void Copy(Tensor _dst, Tensor _src, - cudaMemcpyKind kind) { + cudaMemcpyKind kind, + Stream *stream) { utils::Check(_dst.shape_ == _src.shape_, "Copy:shape mismatch"); Tensor dst = _dst.FlatTo2D(); Tensor src = _src.FlatTo2D(); - cudaError_t err = cudaMemcpy2D(dst.dptr_, dst.stride_ * sizeof(DType), - src.dptr_, src.stride_ * sizeof(DType), - dst.size(1) * sizeof(DType), - dst.size(0), kind); + cudaError_t err = cudaMemcpy2DAsync(dst.dptr_, dst.stride_ * sizeof(DType), + src.dptr_, src.stride_ * sizeof(DType), + dst.size(1) * sizeof(DType), + dst.size(0), kind, + Stream::GetStream(stream)); utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + // use synchronize call behavior for zero stream + if (stream == NULL) { + err = cudaStreamSynchronize(0); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + } } template inline void Copy(Tensor dst, - const Tensor &src) { - Copy(dst, src, cudaMemcpyDeviceToHost); + const Tensor &src, + Stream *stream) { + Copy(dst, src, cudaMemcpyDeviceToHost, stream); } template inline void Copy(Tensor dst, - const Tensor &src) { - Copy(dst, src, cudaMemcpyDeviceToDevice); + const Tensor &src, + Stream *stream) { + Copy(dst, src, cudaMemcpyDeviceToDevice, stream); } template inline void Copy(Tensor dst, - const Tensor &src) { - 
Copy(dst, src, cudaMemcpyHostToDevice); + const Tensor &src, + Stream *stream) { + Copy(dst, src, cudaMemcpyHostToDevice, stream); } #endif // MSHADOW_USE_CUDA } // namespace mshadow @@ -112,12 +172,12 @@ inline void MapExp(TRValue *dst, expr::TypeCheckPass::kMapPass> ::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); Shape eshape = expr::ShapeCheck::Check(exp.self()); - utils::Check(eshape[0] == 0 || eshape == dst->self().shape_, + Shape dshape = expr::ShapeCheck::Check(dst->self()); + utils::Check(eshape[0] == 0 || eshape == dshape, "Assignment: Shape of Tensors are not consistent with target"); cuda::MapPlan(MakePlan(dst->self()), MakePlan(exp.self()), - dst->self().shape_.FlatTo2D(), - dst->self().stride_); + dshape.FlatTo2D()); } template *dst, ::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); Shape<2> eshape = expr::ShapeCheck::kDim, E> ::Check(exp.self()).FlatTo2D(); - utils::Check(eshape[1] == dst->self().size(0), + Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self()); + utils::Check(eshape[1] == dshape[0], "MapReduceKeepLowest::reduction dimension do not match"); utils::Check(eshape[0] != 0, "can not reduce over empty tensor"); cuda::MapReduceKeepLowest @@ -146,7 +207,8 @@ inline void MapReduceKeepHighDim(TRValue *dst, typedef Shape::kDim> EShape; EShape eshape = expr::ShapeCheck::kDim, E> ::Check(exp.self()); - utils::Check(eshape[dimkeep] == dst->self().size(0), + Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self()); + utils::Check(eshape[dimkeep] == dshape[0], "MapReduceKeepHighDim::reduction dimension do not match"); // use equvalent form Shape<4> pshape = Shape4(eshape.ProdShape(0, dimkeep), From 32b635a49c1318c467002077fa2b99f8d455c264 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 4 Jan 2015 06:11:49 -0800 Subject: [PATCH 059/147] fix stream --- mshadow/tensor_gpu-inl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h index 7c37331c72a3..6cac49872bcb 100644 --- a/mshadow/tensor_gpu-inl.h +++ b/mshadow/tensor_gpu-inl.h @@ -48,14 +48,14 @@ struct Stream { */ inline static cudaStream_t GetStream(Stream *stream) { if (stream == NULL) return 0; - else return stream.stream_; + else return stream->stream_; } }; template<> inline Stream *NewStream(void) { - Stream st = new Stream(); - cudaError_t err = cudaStreamCreate(st->stream_); + Stream *st = new Stream(); + cudaError_t err = cudaStreamCreate(&st->stream_); utils::Check(err == cudaSuccess, cudaGetErrorString(err)); return st; } From 234ef182a330de909bf585c77c893a58b218944b Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sun, 4 Jan 2015 15:09:44 -0800 Subject: [PATCH 060/147] concat exp --- mshadow/extension/concat.h | 88 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 mshadow/extension/concat.h diff --git a/mshadow/extension/concat.h b/mshadow/extension/concat.h new file mode 100644 index 000000000000..c4eb596b1b8a --- /dev/null +++ b/mshadow/extension/concat.h @@ -0,0 +1,88 @@ + +#ifndef MSHADOW_EXTENSION_CONCAT_H_ +#define MSHADOW_EXTENSION_CONCAT_H_ + +#include "mshadow/extension.h" + +namespace mshadow { + namespace expr { + /*! 
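* editor's note: this first ConcatExp is a read-only MakeTensorExp;
 * PATCH 061 below rewrites it as a TRValue with a writable Eval overload,
 * so that concat(a, b) can also appear on the left-hand side of an
 * assignment. */
/*!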
+ * \brief concat expression, concat two tensor's channel + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam srcdim dimension of src + */ + template + struct ConcatExp : public MakeTensorExp, + SrcExp, srcdim, DType> { + const SrcExp &src1_; + const SrcExp &src2_; + index_t height_; + index_t width_; + index_t ch_src1_; + index_t ch_src2_; + ConcatExp(const SrcExp &src1, const SrcExp &src2) : src1_(src1), src2_(src2) { + Shape sshape1 = ShapeCheck::Check(src1_); + Shape sshape2 = ShapeCheck::Check(src2_); + utils::Check(sshape1[srcdim - 2] == sshape2[srcdim - 2], + "ConcatExp: height requirement not met"); + utils::Check(sshape1[srcdim - 1] == sshape2[srcdim - 1], + "ConcatExp: width requirement not met"); + utils::Check(sshape1[0] == sshape2[0], + "ConcatExp: batch requirement not met"); + this->shape_ = sshape1; + this->shape_[1] = sshape1[1] + sshape2[1]; + this->ch_src1_ = sshape1[1]; + this->ch_src2_ = sshape2[1]; + this->height_ = sshape1[2]; + this->width_ = sshape1[3]; + } + }; // struct ConcatExp + /*! + * \brief concat two 4D tensor + * \param src1 source tensor1 + * \param src2 source tensor2 + * \return concated 4D tensor + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ + template + inline ConcatExp::kDim> + concat(const Exp &src1, const Exp &src2) { + TypeCheckPass::kDim == 4> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return ConcatExp::kDim>(src1.self(), src2.self()); + } + + //---------------------- + // Execution plan + //--------------------- + template + struct Plan, DType> { + public: + explicit Plan(const ConcatExp &e) : + src1_(MakePlan(e.src1_)), src2_(MakePlan(e.src2_)), + height_(e.height_), width_(e.width_), + ch_src1_(e.ch_src1_), ch_src2_(e.ch_src2_), ch_(e.ch_src1_ + e.ch_src2_) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + const index_t y = i % height_; + i /= height_; + const index_t c = i % ch_; + const index_t x = j; + if (c < ch_src1_) return src1_.Eval(c * height_ + y, x); + else return src2_.Eval((c - ch_src1_) * height_ + y, x); + } + private: + Plan src1_; + Plan src2_; + const index_t height_, width_, ch_src1_, ch_src2_, ch_; + }; // struct Plan + + }// namespace expr +} // namespace mshadow + + + + +#endif // MSHADOW_EXTENSION_CONCAT_H_ From b3f63023393b7d0cd18ed2cda436646335866df8 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 4 Jan 2015 17:19:03 -0800 Subject: [PATCH 061/147] add rvalue concat --- example/Makefile | 4 +- example/basic.cpp | 4 +- mshadow/expression.h | 18 +--- mshadow/extension.h | 1 + mshadow/extension/concat.h | 191 ++++++++++++++++++++++--------------- mshadow/tensor.h | 40 ++++---- 6 files changed, 144 insertions(+), 114 deletions(-) diff --git a/example/Makefile b/example/Makefile index 8dc0aee554a9..51e6588bc94b 100644 --- a/example/Makefile +++ b/example/Makefile @@ -1,8 +1,8 @@ # set LD_LIBRARY_PATH export CC = gcc -export CXX = g++ +export CXX = clang++ export NVCC =nvcc -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ +export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -DMSHADOW_STAND_ALONE=1 export LDFLAGS= -lm export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) diff --git a/example/basic.cpp b/example/basic.cpp index 6cf6e5f4c942..d4f55b824884 100644 --- a/example/basic.cpp +++ b/example/basic.cpp @@ -13,6 +13,7 @@ int main(void) { double data[20]; // create a 2 x 5 x 2 tensor, from existing space Tensor ts(data, Shape3(2,5,2)); + 
Tensor ts4(data, Shape4(2,2,2,2)); // take first subscript of the tensor Tensor mat = ts[0]; // Tensor object is only a handle, assignment means they have same data content @@ -22,7 +23,8 @@ int main(void) { ts.stream_ = NewStream(); DeleteStream(ts.stream_); - mat2[1] = 10; + concat(ts4, ts4) = ts4; + mat2 = rnd.uniform(mat2.shape_); // shape of matrix, note shape order is different from numpy // shape[i] indicate the shape of i-th dimension diff --git a/mshadow/expression.h b/mshadow/expression.h index 428f22fdc0d2..c45568274aa2 100644 --- a/mshadow/expression.h +++ b/mshadow/expression.h @@ -159,23 +159,13 @@ class RValueExp: public Exp { return *(this->ptrself()); } /*! \brief we can not define container = container */ - template - inline Container &__assign(const Exp &exp) { - ExpEngine::Eval(this->ptrself(), exp.self()); - return *(this->ptrself()); - } - /*! \brief we can not define conatiner = container */ - template - inline Container &__assign(const Exp &exp) { - ExpEngine::Eval(this->ptrself(), exp.self()); - return *(this->ptrself()); - } - /*! \brief we can not define container = container */ - template - inline Container &__assign(const Exp &exp) { + template + inline Container &__assign(const Exp &exp) { ExpEngine::Eval(this->ptrself(), exp.self()); return *(this->ptrself()); } + // declar but not implement the assign to self type + inline Container &__assign(const Exp &exp); /*! \brief implementation of operator+= */ template inline Container &operator+=(const Exp &exp) { diff --git a/mshadow/extension.h b/mshadow/extension.h index f9e9badeb82b..865d8fe5bc5b 100644 --- a/mshadow/extension.h +++ b/mshadow/extension.h @@ -20,5 +20,6 @@ #include "./extension/pad.h" #include "./extension/crop.h" #include "./extension/mirror.h" +#include "./extension/concat.h" #endif diff --git a/mshadow/extension/concat.h b/mshadow/extension/concat.h index c4eb596b1b8a..465755638dae 100644 --- a/mshadow/extension/concat.h +++ b/mshadow/extension/concat.h @@ -1,88 +1,129 @@ - #ifndef MSHADOW_EXTENSION_CONCAT_H_ #define MSHADOW_EXTENSION_CONCAT_H_ #include "mshadow/extension.h" namespace mshadow { - namespace expr { - /*! - * \brief concat expression, concat two tensor's channel - * \tparam SrcExp source expression - * \tparam DType the type of elements - * \tparam srcdim dimension of src - */ - template - struct ConcatExp : public MakeTensorExp, - SrcExp, srcdim, DType> { - const SrcExp &src1_; - const SrcExp &src2_; - index_t height_; - index_t width_; - index_t ch_src1_; - index_t ch_src2_; - ConcatExp(const SrcExp &src1, const SrcExp &src2) : src1_(src1), src2_(src2) { - Shape sshape1 = ShapeCheck::Check(src1_); - Shape sshape2 = ShapeCheck::Check(src2_); - utils::Check(sshape1[srcdim - 2] == sshape2[srcdim - 2], +namespace expr { +/*! 
+ * \brief concat expression, concat two tensor's channel + * \tparam LhsExp left expression + * \tparam RhsExp right expression + * \tparam DType the type of elements + * \tparam srcdim dimension of src + */ +template +struct ConcatExp : public TRValue, + Device, srcdim, DType> { + const LhsExp &src1_; + const RhsExp &src2_; + index_t height_; + index_t width_; + index_t ch_src1_; + index_t ch_src2_; + Shape<4> shape_; + ConcatExp(const LhsExp &src1, const RhsExp &src2) : src1_(src1), src2_(src2) { + Shape sshape1 = ShapeCheck::Check(src1_); + Shape sshape2 = ShapeCheck::Check(src2_); + utils::Check(sshape1[srcdim - 2] == sshape2[srcdim - 2], "ConcatExp: height requirement not met"); - utils::Check(sshape1[srcdim - 1] == sshape2[srcdim - 1], + utils::Check(sshape1[srcdim - 1] == sshape2[srcdim - 1], "ConcatExp: width requirement not met"); - utils::Check(sshape1[0] == sshape2[0], + utils::Check(sshape1[0] == sshape2[0], "ConcatExp: batch requirement not met"); - this->shape_ = sshape1; - this->shape_[1] = sshape1[1] + sshape2[1]; - this->ch_src1_ = sshape1[1]; - this->ch_src2_ = sshape2[1]; - this->height_ = sshape1[2]; - this->width_ = sshape1[3]; - } - }; // struct ConcatExp - /*! - * \brief concat two 4D tensor - * \param src1 source tensor1 - * \param src2 source tensor2 - * \return concated 4D tensor - * \tparam SrcExp source expression - * \tparam DType the type of elements - * \tparam etype type of expression - */ - template - inline ConcatExp::kDim> - concat(const Exp &src1, const Exp &src2) { - TypeCheckPass::kDim == 4> - ::Error_Expression_Does_Not_Meet_Dimension_Req(); - return ConcatExp::kDim>(src1.self(), src2.self()); - } - - //---------------------- - // Execution plan - //--------------------- - template - struct Plan, DType> { - public: - explicit Plan(const ConcatExp &e) : + this->shape_ = sshape1; + this->shape_[1] = sshape1[1] + sshape2[1]; + this->ch_src1_ = sshape1[1]; + this->ch_src2_ = sshape2[1]; + this->height_ = sshape1[2]; + this->width_ = sshape1[3]; + } + template + inline void + operator=(const expr::Exp &exp) { + this->__assign(exp); + } +}; // struct ConcatExp +/*! + * \brief concat two 4D tensor + * \param src1 source tensor1 + * \param src2 source tensor2 + * \return concated 4D tensor + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline ConcatExp::kDim> +concat(const TRValue &src1, + const TRValue &src2) { + TypeCheckPass::kDim == 4> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + TypeCheckPass::kDim == 4> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return ConcatExp::kDim> + (src1.self(), src2.self()); +} +//------------------------ +// engine plugin +//------------------------ +// runtime shapecheck +template +struct ShapeCheck >{ + inline static Shape Check(const ConcatExp &t) { + return t.shape_; + } +}; +// static typecheck +template +struct ExpInfo >{ + static const int kDimLhs = ExpInfo::kDim; + static const int kDimRhs = ExpInfo::kDim; + // copy from binarymap + static const int kDim = (kDimLhs >= 0 && kDimRhs >= 0) ?\ + (kDimLhs == 0 ?\ + kDimRhs :\ + ((kDimRhs == 0 || kDimLhs == kDimRhs) ? 
kDimLhs : -1)) : -1; + static const int kDevMask = ExpInfo::kDevMask & ExpInfo::kDevMask; +}; +//---------------------- +// Execution plan +//--------------------- +template +struct Plan, DType> { + public: + explicit Plan(const ConcatExp &e) : src1_(MakePlan(e.src1_)), src2_(MakePlan(e.src2_)), - height_(e.height_), width_(e.width_), - ch_src1_(e.ch_src1_), ch_src2_(e.ch_src2_), ch_(e.ch_src1_ + e.ch_src2_) {} - MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { - const index_t y = i % height_; - i /= height_; - const index_t c = i % ch_; - const index_t x = j; - if (c < ch_src1_) return src1_.Eval(c * height_ + y, x); - else return src2_.Eval((c - ch_src1_) * height_ + y, x); - } - private: - Plan src1_; - Plan src2_; - const index_t height_, width_, ch_src1_, ch_src2_, ch_; - }; // struct Plan + height_(e.height_), width_(e.width_), + ch_src1_(e.ch_src1_), ch_src2_(e.ch_src2_), ch_(e.ch_src1_ + e.ch_src2_) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + const index_t y = i % height_; + i /= height_; + const index_t c = i % ch_; + const index_t x = j; + if (c < ch_src1_) return src1_.Eval(c * height_ + y, x); + else return src2_.Eval((c - ch_src1_) * height_ + y, x); + } + MSHADOW_XINLINE DType &Eval(index_t i, index_t j) { + const index_t y = i % height_; + i /= height_; + const index_t c = i % ch_; + const index_t x = j; + if (c < ch_src1_) return src1_.Eval(c * height_ + y, x); + else return src2_.Eval((c - ch_src1_) * height_ + y, x); + } + private: + Plan src1_; + Plan src2_; + const index_t height_, width_, ch_src1_, ch_src2_, ch_; +}; // struct Plan - }// namespace expr +}// namespace expr } // namespace mshadow - - - - #endif // MSHADOW_EXTENSION_CONCAT_H_ diff --git a/mshadow/tensor.h b/mshadow/tensor.h index 78549ed1ad5e..af9315ba5839 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -327,22 +327,19 @@ struct Tensor: public TRValue, return Tensor(dptr_ + this->MemSize<1>() * begin, s, stride_); } - /*!\brief functions to fit expression template */ - template - inline Tensor & - operator=(const expr::Exp &exp) { - return this->__assign(exp); - } - /*!\brief functions to fit expression template */ - template + /*!\brief implement the assignment of same type */ + template inline Tensor & - operator=(const expr::Exp &exp) { - return this->__assign(exp); + operator=(const Tensor &exp) { + dptr_ = exp.dptr; + shape_ = exp.shape_; + stride_ = exp.stride_; + stream_ = exp.stream_; } /*!\brief functions to fit expression template */ - template + template inline Tensor & - operator=(const expr::Exp &exp) { + operator=(const expr::Exp &exp) { return this->__assign(exp); } inline Tensor &operator=(const DType &exp) { @@ -387,19 +384,18 @@ struct Tensor: MSHADOW_XINLINE const DType &operator[](index_t idx) const { return dptr_[idx]; } - template + /*!\brief implement the assignment of same type */ + template inline Tensor & - operator=(const expr::Exp &exp) { - return this->__assign(exp); - } - template - inline Tensor & - operator=(const expr::Exp &exp) { - return this->__assign(exp); + operator=(const Tensor &exp) { + dptr_ = exp.dptr; + shape_ = exp.shape_; + stride_ = exp.stride_; + stream_ = exp.stream_; } - template + template inline Tensor & - operator=(const expr::Exp &exp) { + operator=(const expr::Exp &exp) { return this->__assign(exp); } inline Tensor &operator=(const DType &exp) { From e1b61886d58f2d206ad0dd36c0bc2a82ecb2e225 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 4 Jan 2015 18:52:53 -0800 Subject: [PATCH 062/147] gpu streams --- 
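This patch threads a cudaStream_t from the destination tensor down through MapPlan, the reduction kernels, cuBLAS and cuRAND, so work issued on different streams can overlap. A minimal sketch of the intended usage (GPU build assumed; the scaling function is illustrative, not part of the patch):

    #include "mshadow/tensor.h"

    using namespace mshadow;

    void ScaleAsync(Tensor<gpu, 2, float> data) {
      Stream<gpu> *s = NewStream<gpu>();
      data.stream_ = s;    // kernels for this destination now target s
      data = data * 2.0f;  // launched asynchronously on s, not stream 0
      s->Wait();           // block until the scaling has completed
      DeleteStream(s);
    }

With one stream per data-parallel worker, independent updates on the same device no longer serialize behind the default stream.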
example/basic.cpp | 1 - mshadow/cuda/tensor_gpu-inl.cuh | 25 +++++++++++++++---------- mshadow/dot_engine-inl.h | 11 +++++++++++ mshadow/expr_engine-inl.h | 16 +++++++++++++++- mshadow/random.h | 16 ++++++++++++++++ mshadow/tensor_gpu-inl.h | 11 +++++++---- 6 files changed, 64 insertions(+), 16 deletions(-) diff --git a/example/basic.cpp b/example/basic.cpp index d4f55b824884..d9780502a83e 100644 --- a/example/basic.cpp +++ b/example/basic.cpp @@ -23,7 +23,6 @@ int main(void) { ts.stream_ = NewStream(); DeleteStream(ts.stream_); - concat(ts4, ts4) = ts4; mat2 = rnd.uniform(mat2.shape_); // shape of matrix, note shape order is different from numpy diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index eac83a10d87c..4d98a69dbaea 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -82,7 +82,8 @@ __global__ void MapPlanLargeKernel(DstPlan dst, index_t xstride, template inline void MapPlan(expr::Plan dst, const expr::Plan &plan, - Shape<2> dshape) { + Shape<2> dshape, + cudaStream_t stream) { const index_t xstride = GetAlignStride(dshape[1]); const int num_block = (dshape[0] * xstride + kBaseThreadNum-1) / kBaseThreadNum; dim3 dimBlock(kBaseThreadNum, 1, 1); @@ -92,14 +93,14 @@ inline void MapPlan(expr::Plan dst, MapPlanKernel, expr::Plan > - <<>>(dst, xstride, dshape, plan); + <<>>(dst, xstride, dshape, plan); } else { int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum; dim3 dimGrid(kBaseGridNum, 1 , 1); MapPlanLargeKernel, expr::Plan > - <<>>(dst, xstride, dshape, plan, repeat); + <<>>(dst, xstride, dshape, plan, repeat); } } @@ -137,14 +138,15 @@ template inline void MapReduceKeepLowest(expr::Plan dst, const expr::Plan &plan, - DType scale, Shape<2> eshape) { + DType scale, Shape<2> eshape, + cudaStream_t stream) { dim3 dimBlock(kMemUnit, kMemUnit); dim3 dimGrid((eshape[1] + kMemUnit - 1) >> kMemUnitBits); CheckLaunchParam(dimGrid, dimBlock, "MapRedKeepLowestKernel"); MapRedKeepLowestKernel, expr::Plan > - <<>>(dst, plan, scale, eshape); + <<>>(dst, plan, scale, eshape); } template inline void MapReduceKeepDim1(expr::Plan dst, const expr::Plan &plan, - DType scale, Shape<4> pshape) { + DType scale, Shape<4> pshape, + cudaStream_t stream) { dim3 dimBlock(kBaseThreadNum); dim3 dimGrid (pshape[1]); CheckLaunchParam(dimGrid, dimBlock, "MapReduceKeepDim1"); MapReduceKeepDim1Kernel, expr::Plan > - <<>>(dst, plan, scale, pshape); + <<>>(dst, plan, scale, pshape); } template @@ -242,10 +245,12 @@ inline void Softmax(Tensor &dst, dim3 dimGrid(dst.size(0)); utils::Check(dst.shape_ == src.shape_, "Softmax: shape mismatch"); CheckLaunchParam(dimGrid, dimBlock, "Softmax"); + cudaStream_t stream = Stream::GetStream(dst.stream_); SoftmaxKernel - <<>>(expr::MakePlan(dst), - expr::MakePlan(src), - dst.size(1)); + <<>> + (expr::MakePlan(dst), + expr::MakePlan(src), + dst.size(1)); } } // namespace cuda } // namespace mshadow diff --git a/mshadow/dot_engine-inl.h b/mshadow/dot_engine-inl.h index 4df71ccdbdfd..168441657baa 100644 --- a/mshadow/dot_engine-inl.h +++ b/mshadow/dot_engine-inl.h @@ -28,6 +28,8 @@ struct BLASEngine { inline static CBLAS_TRANSPOSE GetT(bool t) { return t ? CblasTrans : CblasNoTrans; } + inline static void SetStream(Stream *stream) { + } inline static void gemm(bool transa, bool transb, int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, @@ -76,6 +78,9 @@ struct BLASEngine { inline static char GetT(bool t) { return t ? 
'T' : 'N'; } + inline static void SetStream(Stream *stream) { + cublasSetKernelStream(Stream::GetStream(stream)); + } inline static void gemm(bool transa, bool transb, int m, int n, int k, float alpha, const float *A, int lda, @@ -129,6 +134,8 @@ struct DotEngine { const Tensor &rhs, DType scale) { Tensor &dst = *p_dst; + // set kernel stream + BLASEngine::SetStream(dst.stream_); Shape<2> sleft = GetShape(lhs.shape_, transpose_left); Shape<2> sright = GetShape(rhs.shape_, transpose_right); utils::Check(dst.size(0) == sleft[0] && dst.size(1) == sright[1] \ @@ -154,6 +161,8 @@ struct DotEngine { const Tensor &rhs, DType scale) { Tensor &dst = *p_dst; + // set kernel stream + BLASEngine::SetStream(dst.stream_); Shape<2> sright = GetShape(rhs.shape, transpose_right); utils::Check(dst.size(0) == sright[1] && lhs.size(0) == sright[0], "dot-gemv: matrix shape mismatch"); @@ -172,6 +181,8 @@ struct DotEngine { const Tensor &rhs, DType scale) { Tensor &dst = *p_dst; + // set kernel stream + BLASEngine::SetStream(dst.stream_); utils::Check(dst.size(0) == lhs.size(0) && dst.size(1) == rhs.size(0), "dot-ger: matrix shape mismatch"); if (SV::kBetaBLAS == 0.0f) { diff --git a/mshadow/expr_engine-inl.h b/mshadow/expr_engine-inl.h index 73269c3db398..e80546c0b51a 100644 --- a/mshadow/expr_engine-inl.h +++ b/mshadow/expr_engine-inl.h @@ -281,6 +281,20 @@ struct TypeCheckPass { inline static void Error_TypeCheck_Not_Pass_For_Reduce_Exp(void) {} inline static void Error_Expression_Does_Not_Meet_Dimension_Req(void) {} }; + +//---------------------------------------------------------------- +// Runtime Stream Getting +//---------------------------------------------------------------- +template +struct StreamInfo { + inline static Stream *Get(const E &t); +}; +template +struct StreamInfo > { + inline static Stream *Get(const Tensor &t) { + return t.stream_; + } +}; //---------------------------------------------------------------- // Runtime Shape Checking //---------------------------------------------------------------- @@ -291,7 +305,7 @@ struct TypeCheckPass { * \tparam E expression */ template -struct ShapeCheck{ +struct ShapeCheck { inline static Shape Check(const E &t); }; template diff --git a/mshadow/random.h b/mshadow/random.h index 432a5d107b3d..7a52a25d1ce6 100644 --- a/mshadow/random.h +++ b/mshadow/random.h @@ -49,6 +49,12 @@ class Random { #else this->rseed_ = static_cast(seed); #endif + } + /*! + * \brief set the stream of computation + * \param stream computation stream + */ + inline void SetStream(Stream *stream) { } /*! * \brief generate data from uniform [a,b) @@ -229,6 +235,16 @@ class Random { utils::Check(status == CURAND_STATUS_SUCCESS, "Destory CURAND Gen failed"); } + /*! + * \brief set the stream of computation + * \param stream computation stream + */ + inline void SetStream(Stream *stream) { + curandStatus_t status; + status = curandSetStream(gen_, Stream::GetStream(stream)); + utils::Check(status == CURAND_STATUS_SUCCESS, + "SetStream CURAND failed"); + } /*! 
* \brief seed random number generator using this seed * \param seed seed of prng diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h index 6cac49872bcb..c0533fd362d8 100644 --- a/mshadow/tensor_gpu-inl.h +++ b/mshadow/tensor_gpu-inl.h @@ -177,7 +177,8 @@ inline void MapExp(TRValue *dst, "Assignment: Shape of Tensors are not consistent with target"); cuda::MapPlan(MakePlan(dst->self()), MakePlan(exp.self()), - dshape.FlatTo2D()); + dshape.FlatTo2D(), + Stream::GetStream(StreamInfo::Get(dst->self()))); } template *dst, "MapReduceKeepLowest::reduction dimension do not match"); utils::Check(eshape[0] != 0, "can not reduce over empty tensor"); cuda::MapReduceKeepLowest - (MakePlan(dst->self()), MakePlan(exp.self()), scale, eshape); + (MakePlan(dst->self()), MakePlan(exp.self()), scale, eshape, + StreamInfo::Get(dst->self())); } template *dst, eshape[EShape::kSubdim]); // call equavalent map red dim 2 cuda::MapReduceKeepDim1 - (MakePlan(dst->self()), MakePlan(exp.self()), scale, pshape); + (MakePlan(dst->self()), MakePlan(exp.self()), scale, pshape, + Stream::GetStream(StreamInfo::Get(dst->self()))); } template inline void Softmax(Tensor dst, const Tensor& src) { - cuda::Softmax(dst, src); + cuda::Softmax(dst, src, stream); } } // namespace mshadow #endif // __CUDACC__ From a5c5a3f669fe3d0dd54ef8febe413ab964459ae4 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 4 Jan 2015 19:11:07 -0800 Subject: [PATCH 063/147] checkin stream --- mshadow/cuda/tensor_gpu-inl.cuh | 2 +- mshadow/tensor.h | 5 ++- mshadow/tensor_gpu-inl.h | 61 ++++----------------------------- 3 files changed, 11 insertions(+), 57 deletions(-) diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index 4d98a69dbaea..908877cabd9f 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -247,7 +247,7 @@ inline void Softmax(Tensor &dst, CheckLaunchParam(dimGrid, dimBlock, "Softmax"); cudaStream_t stream = Stream::GetStream(dst.stream_); SoftmaxKernel - <<>> + <<>> (expr::MakePlan(dst), expr::MakePlan(src), dst.size(1)); diff --git a/mshadow/tensor.h b/mshadow/tensor.h index af9315ba5839..841a1d7d34a2 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -335,6 +335,7 @@ struct Tensor: public TRValue, shape_ = exp.shape_; stride_ = exp.stride_; stream_ = exp.stream_; + return *this; } /*!\brief functions to fit expression template */ template @@ -391,7 +392,8 @@ struct Tensor: dptr_ = exp.dptr; shape_ = exp.shape_; stride_ = exp.stride_; - stream_ = exp.stream_; + stream_ = exp.stream_; + return *this; } template inline Tensor & @@ -584,6 +586,7 @@ inline void MapReduceKeepHighDim(TRValue *dst, DType scale = 1); } // namespace mshadow // include headers +#include "./stream_gpu-inl.h" #include "./expr_engine-inl.h" #include "./extension.h" #include "./tensor_cpu-inl.h" diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h index c0533fd362d8..ea8eaa4bd351 100644 --- a/mshadow/tensor_gpu-inl.h +++ b/mshadow/tensor_gpu-inl.h @@ -1,7 +1,7 @@ /*! * Copyright (c) 2014 by Contributors - * \file tensor_cpu-inl.h - * \brief implementation of CPU host code + * \file tensor_gpu-inl.h + * \brief implementation of GPU host code * \author Bing Xu, Tianqi Chen */ #ifndef MSHADOW_TENSOR_GPU_INL_H_ @@ -17,55 +17,6 @@ inline void InitTensorEngine(int dev_id) { inline void ShutdownTensorEngine(void) { } #else -// Stream alocation -// actual implementation of GPU stream in CUDA -template<> -struct Stream { - /*! \brief cudaStream */ - cudaStream_t stream_; - /*! 
- * \brief wait for all the computation associated - * with this stream to complete - */ - inline void Wait(void) { - cudaError_t err = cudaStreamSynchronize(stream_); - utils::Check(err == cudaSuccess, cudaGetErrorString(err)); - } - /*! - * \brief query whether the the stream is idle - * \return true if the stream is idle and all the job have been completed - */ - inline bool CheckIdle(void) { - cudaError_t err = cudaStreamQuery(stream_); - if (err == cudaSuccess) return true; - if (err == cudaErrorNotReady) return false; - utils::Error(cudaGetErrorString(err)); - return false; - } - /*! - * \brief returns actual cudaStream_t given an input GPU stream pointer - * \param stream pointer to GPU stream - */ - inline static cudaStream_t GetStream(Stream *stream) { - if (stream == NULL) return 0; - else return stream->stream_; - } -}; - -template<> -inline Stream *NewStream(void) { - Stream *st = new Stream(); - cudaError_t err = cudaStreamCreate(&st->stream_); - utils::Check(err == cudaSuccess, cudaGetErrorString(err)); - return st; -} -template<> -inline void DeleteStream(Stream *stream) { - cudaError_t err = cudaStreamDestroy(stream->stream_); - utils::Check(err == cudaSuccess, cudaGetErrorString(err)); - delete stream; -} - #if (MSHADOW_USE_NVML) inline int AutoSelectDevice(int device_count) { // TODO(bing): nvml device id and cuda device id are not consistent @@ -178,7 +129,7 @@ inline void MapExp(TRValue *dst, cuda::MapPlan(MakePlan(dst->self()), MakePlan(exp.self()), dshape.FlatTo2D(), - Stream::GetStream(StreamInfo::Get(dst->self()))); + Stream::GetStream(expr::StreamInfo::Get(dst->self()))); } template *dst, utils::Check(eshape[0] != 0, "can not reduce over empty tensor"); cuda::MapReduceKeepLowest (MakePlan(dst->self()), MakePlan(exp.self()), scale, eshape, - StreamInfo::Get(dst->self())); + Stream::GetStream(expr::StreamInfo::Get(dst->self()))); } template *dst, // call equavalent map red dim 2 cuda::MapReduceKeepDim1 (MakePlan(dst->self()), MakePlan(exp.self()), scale, pshape, - Stream::GetStream(StreamInfo::Get(dst->self()))); + Stream::GetStream(expr::StreamInfo::Get(dst->self()))); } template inline void Softmax(Tensor dst, const Tensor& src) { - cuda::Softmax(dst, src, stream); + cuda::Softmax(dst, src); } } // namespace mshadow #endif // __CUDACC__ From c0201ccf3cbb5fa51400c6bcdc10fd3844c9c872 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 4 Jan 2015 19:12:25 -0800 Subject: [PATCH 064/147] add in stream gpu --- mshadow/stream_gpu-inl.h | 62 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 mshadow/stream_gpu-inl.h diff --git a/mshadow/stream_gpu-inl.h b/mshadow/stream_gpu-inl.h new file mode 100644 index 000000000000..3b7bca6d1a4f --- /dev/null +++ b/mshadow/stream_gpu-inl.h @@ -0,0 +1,62 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file stream_gpu-inl.h + * \brief implementation of GPU code + * \author Bing Xu, Tianqi Chen + */ +#ifndef MSHADOW_STREAM_GPU_INL_H_ +#define MSHADOW_STREAM_GPU_INL_H_ +#include "./base.h" +#include "./tensor.h" +#include "./utils.h" + +namespace mshadow { +// Stream alocation +// actual implementation of GPU stream in CUDA +template<> +struct Stream { + /*! \brief cudaStream */ + cudaStream_t stream_; + /*! + * \brief wait for all the computation associated + * with this stream to complete + */ + inline void Wait(void) { + cudaError_t err = cudaStreamSynchronize(stream_); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + } + /*! 
+ * \brief query whether the the stream is idle + * \return true if the stream is idle and all the job have been completed + */ + inline bool CheckIdle(void) { + cudaError_t err = cudaStreamQuery(stream_); + if (err == cudaSuccess) return true; + if (err == cudaErrorNotReady) return false; + utils::Error(cudaGetErrorString(err)); + return false; + } + /*! + * \brief returns actual cudaStream_t given an input GPU stream pointer + * \param stream pointer to GPU stream + */ + inline static cudaStream_t GetStream(Stream *stream) { + if (stream == NULL) return 0; + else return stream->stream_; + } +}; +template<> +inline Stream *NewStream(void) { + Stream *st = new Stream(); + cudaError_t err = cudaStreamCreate(&st->stream_); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + return st; +} +template<> +inline void DeleteStream(Stream *stream) { + cudaError_t err = cudaStreamDestroy(stream->stream_); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + delete stream; +} +} +#endif // MSHADOW_STREAM_GPU_INL_H_ From 0a8f810a21a6ea29e90a881ef51e9aaca94d6525 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sun, 4 Jan 2015 20:24:23 -0800 Subject: [PATCH 065/147] add cuda flag --- mshadow/stream_gpu-inl.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mshadow/stream_gpu-inl.h b/mshadow/stream_gpu-inl.h index 3b7bca6d1a4f..e7d18aa42001 100644 --- a/mshadow/stream_gpu-inl.h +++ b/mshadow/stream_gpu-inl.h @@ -11,6 +11,7 @@ #include "./utils.h" namespace mshadow { +#if MSHADOW_USE_CUDA==1 // Stream alocation // actual implementation of GPU stream in CUDA template<> @@ -58,5 +59,6 @@ inline void DeleteStream(Stream *stream) { utils::Check(err == cudaSuccess, cudaGetErrorString(err)); delete stream; } +#endif } #endif // MSHADOW_STREAM_GPU_INL_H_ From 6f4de5c3e7d15e958bcd6305c1e2c76d07f44871 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 4 Jan 2015 20:45:59 -0800 Subject: [PATCH 066/147] fix concat stream --- example/basic.cpp | 1 - mshadow/extension/concat.h | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/example/basic.cpp b/example/basic.cpp index d9780502a83e..09f9dcbccc80 100644 --- a/example/basic.cpp +++ b/example/basic.cpp @@ -23,7 +23,6 @@ int main(void) { ts.stream_ = NewStream(); DeleteStream(ts.stream_); - mat2 = rnd.uniform(mat2.shape_); // shape of matrix, note shape order is different from numpy // shape[i] indicate the shape of i-th dimension diff --git a/mshadow/extension/concat.h b/mshadow/extension/concat.h index 465755638dae..f2d06ecb680e 100644 --- a/mshadow/extension/concat.h +++ b/mshadow/extension/concat.h @@ -78,6 +78,16 @@ struct ShapeCheck >{ return t.shape_; } }; +template +struct StreamInfo >{ + inline static Stream *Get(const ConcatExp &t) { + Stream *lhs = StreamInfo::Get(t.src1_); + Stream *rhs = StreamInfo::Get(t.src2_); + if (lhs != rhs) return NULL; + return lhs; + } +}; // static typecheck template From e895fd852f86472d38916e4a300ba7221be54296 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Sun, 4 Jan 2015 20:58:20 -0800 Subject: [PATCH 067/147] concat test --- example/concat.cpp | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 example/concat.cpp diff --git a/example/concat.cpp b/example/concat.cpp new file mode 100644 index 000000000000..2a200386dd8c --- /dev/null +++ b/example/concat.cpp @@ -0,0 +1,30 @@ +#include "mshadow/tensor.h" + +using namespace mshadow; +using namespace mshadow::expr; + + +void Print2D(const Tensor&t) { + for (int i = 0; i < t.size(0); ++i) { + for (int j = 
0; j < t.size(1); ++j) { + printf("%.2f ", t[i][j]); + } + printf("\n"); + } +} + +int main() { + Tensor t1 = NewTensor(Shape4(2, 2, 3,2), 0.1f); + Tensor t2 = NewTensor(Shape4(2, 3, 3,2), 0.2f); + Tensor t3 = NewTensor(Shape4(2, 1, 3,2), 0.3f); + Tensor t = NewTensor(Shape4(2,6,3,2), 0.0f); + t = concat(t1, concat(t2, t3)); + Print2D(t[0][2]); + t += 1.0f; + concat(t1, concat(t2, t3)) = t; + Print2D(t3[1][0]); + FreeSpace(&t1); + FreeSpace(&t2); + FreeSpace(&t3); + FreeSpace(&t); +} From 4170eefa227bc3355ba38aac65aaa4bbd6567558 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 4 Jan 2015 20:59:50 -0800 Subject: [PATCH 068/147] change eval to reval --- mshadow/cuda/tensor_gpu-inl.cuh | 10 +++++----- mshadow/expr_engine-inl.h | 4 ++-- mshadow/extension/concat.h | 6 +++--- mshadow/tensor_cpu-inl.h | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index 908877cabd9f..e0b61ff22d30 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -59,7 +59,7 @@ __device__ void MapPlanProc(DstPlan dst, index_t xstride, const int y = tid / xstride; const int x = tid % xstride; if (y < dshape[0] && x < dshape[1]) { - Saver::Save(dst.Eval(y, x), exp.Eval(y,x)); + Saver::Save(dst.REval(y, x), exp.Eval(y,x)); } } template(s_rec); if (threadIdx.x == 0) { - Saver::Save(dst.Eval(0, c), s_rec[0] * scale); + Saver::Save(dst.REval(0, c), s_rec[0] * scale); } } @@ -223,7 +223,7 @@ __global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) { DType p = expf(src.Eval(y, x + threadIdx.x) - smax); s_rec[threadIdx.x] += p; // write back first, will fetch later - dst.Eval(y, x + threadIdx.x) = p; + dst.REval(y, x + threadIdx.x) = p; } } // calculate normalizer @@ -234,7 +234,7 @@ __global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) { for (unsigned x = 0; x < xmax; x += x_size) { if (x + threadIdx.x < xmax) { - dst.Eval(y, x + threadIdx.x) /= ssum; + dst.REval(y, x + threadIdx.x) /= ssum; } } } diff --git a/mshadow/expr_engine-inl.h b/mshadow/expr_engine-inl.h index e80546c0b51a..b6ed59048a82 100644 --- a/mshadow/expr_engine-inl.h +++ b/mshadow/expr_engine-inl.h @@ -52,7 +52,7 @@ class Plan, DType> { explicit Plan(const Tensor &t) : dptr_(t.dptr_), stride_(t.stride_) {} // for RValue, the return type should be reference - MSHADOW_XINLINE DType &Eval(index_t y, index_t x) { + MSHADOW_XINLINE DType &REval(index_t y, index_t x) { return dptr_[y * stride_ + x]; } // const evaluation @@ -69,7 +69,7 @@ template class Plan, DType> { public: explicit Plan(const Tensor &t) : dptr_(t.dptr_) {} - MSHADOW_XINLINE DType &Eval(index_t y, index_t x) { + MSHADOW_XINLINE DType &REval(index_t y, index_t x) { return dptr_[x]; } MSHADOW_XINLINE const DType &Eval(index_t y, index_t x) const { diff --git a/mshadow/extension/concat.h b/mshadow/extension/concat.h index f2d06ecb680e..8d93b1474056 100644 --- a/mshadow/extension/concat.h +++ b/mshadow/extension/concat.h @@ -120,13 +120,13 @@ struct Plan, DType> { if (c < ch_src1_) return src1_.Eval(c * height_ + y, x); else return src2_.Eval((c - ch_src1_) * height_ + y, x); } - MSHADOW_XINLINE DType &Eval(index_t i, index_t j) { + MSHADOW_XINLINE DType &REval(index_t i, index_t j) { const index_t y = i % height_; i /= height_; const index_t c = i % ch_; const index_t x = j; - if (c < ch_src1_) return src1_.Eval(c * height_ + y, x); - else return src2_.Eval((c - ch_src1_) * height_ + y, x); + if (c < ch_src1_) return src1_.REval(c * height_ + y, x); + else 
return src2_.REval((c - ch_src1_) * height_ + y, x); } private: Plan src1_; diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h index 3bc46e1a0d71..33174171b718 100644 --- a/mshadow/tensor_cpu-inl.h +++ b/mshadow/tensor_cpu-inl.h @@ -69,7 +69,7 @@ inline void MapPlan(TRValue *dst, for (index_t y = 0; y < shape[0]; ++y) { for (index_t x = 0; x < shape[1]; ++x) { // trust your compiler! -_- they will optimize it - Saver::Save(dplan.Eval(y, x), plan.Eval(y, x)); + Saver::Save(dplan.REval(y, x), plan.Eval(y, x)); } } } @@ -139,7 +139,7 @@ inline void MapReduceKeepLowest(TRValue *dst, for (index_t y = 1; y < eshape[0]; ++y) { Reducer::Reduce(res, splan.Eval(y, x)); } - Saver::Save(dplan.Eval(0, x), res * scale); + Saver::Save(dplan.REval(0, x), res * scale); } } @@ -176,7 +176,7 @@ inline void MapReduceKeepHighDim(TRValue *dst, } Reducer::Reduce(res, tres); } - Saver::Save(dplan.Eval(0, c), res * scale); + Saver::Save(dplan.REval(0, c), res * scale); } } From 3521d0d745df5a290daf0fba9b7001e4c148146b Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 4 Jan 2015 21:22:19 -0800 Subject: [PATCH 069/147] fix concat --- example/Makefile | 3 ++- mshadow/extension/concat.h | 14 ++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/example/Makefile b/example/Makefile index 51e6588bc94b..37c300598dea 100644 --- a/example/Makefile +++ b/example/Makefile @@ -7,7 +7,7 @@ export LDFLAGS= -lm export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) # specify tensor path -BIN = basic defop +BIN = basic defop concat OBJ = CUOBJ = CUBIN = basicx @@ -16,6 +16,7 @@ CUBIN = basicx all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ) basic: basic.cpp +concat: concat.cpp defop: defop.cpp basicx: basic.cu diff --git a/mshadow/extension/concat.h b/mshadow/extension/concat.h index 8d93b1474056..d1546abb8709 100644 --- a/mshadow/extension/concat.h +++ b/mshadow/extension/concat.h @@ -45,6 +45,10 @@ struct ConcatExp : public TRValue &exp) { this->__assign(exp); } + inline void + operator=(const DType &exp) { + this->__assign(exp); + } }; // struct ConcatExp /*! 
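 */

The Eval/REval split from patch 068 is what lets ConcatExp sit on either side of an assignment: savers store through the reference returned by REval, while read-only expressions keep the value-returning Eval. A toy illustration of the two access paths (standalone code, not mshadow's Plan):

    #include <cstdio>

    struct ToyPlan {
      float *dptr;
      int stride;
      // read path: returns a value, usable on const plans
      float Eval(int y, int x) const { return dptr[y * stride + x]; }
      // write path: returns a reference so a saver can assign through it
      float &REval(int y, int x) { return dptr[y * stride + x]; }
    };

    int main() {
      float buf[4] = {0, 0, 0, 0};
      ToyPlan p = {buf, 2};
      p.REval(1, 1) = 5.0f;                  // saver-style store
      std::printf("%.1f\n", p.Eval(1, 1));   // reader-style load
      return 0;
    }

 /*!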
* \brief concat two 4D tensor @@ -116,17 +120,19 @@ struct Plan, DType> { const index_t y = i % height_; i /= height_; const index_t c = i % ch_; + const index_t b = i / ch_; const index_t x = j; - if (c < ch_src1_) return src1_.Eval(c * height_ + y, x); - else return src2_.Eval((c - ch_src1_) * height_ + y, x); + if (c < ch_src1_) return src1_.Eval((b * ch_ + c) * height_ + y, x); + else return src2_.Eval((b * ch_ + c - ch_src1_) * height_ + y, x); } MSHADOW_XINLINE DType &REval(index_t i, index_t j) { const index_t y = i % height_; i /= height_; const index_t c = i % ch_; + const index_t b = i / ch_; const index_t x = j; - if (c < ch_src1_) return src1_.REval(c * height_ + y, x); - else return src2_.REval((c - ch_src1_) * height_ + y, x); + if (c < ch_src1_) return src1_.REval((b * ch_ + c) * height_ + y, x); + else return src2_.REval((b * ch_ + c - ch_src1_) * height_ + y, x); } private: Plan src1_; From 4fac292a048cbba6b3540bb1759715670b932367 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 4 Jan 2015 21:27:23 -0800 Subject: [PATCH 070/147] fix --- mshadow/extension/concat.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mshadow/extension/concat.h b/mshadow/extension/concat.h index d1546abb8709..6fc119126cfb 100644 --- a/mshadow/extension/concat.h +++ b/mshadow/extension/concat.h @@ -122,8 +122,8 @@ struct Plan, DType> { const index_t c = i % ch_; const index_t b = i / ch_; const index_t x = j; - if (c < ch_src1_) return src1_.Eval((b * ch_ + c) * height_ + y, x); - else return src2_.Eval((b * ch_ + c - ch_src1_) * height_ + y, x); + if (c < ch_src1_) return src1_.Eval((b * ch_src1_ + c) * height_ + y, x); + else return src2_.Eval((b * ch_src2_ + c - ch_src1_) * height_ + y, x); } MSHADOW_XINLINE DType &REval(index_t i, index_t j) { const index_t y = i % height_; @@ -131,8 +131,8 @@ struct Plan, DType> { const index_t c = i % ch_; const index_t b = i / ch_; const index_t x = j; - if (c < ch_src1_) return src1_.REval((b * ch_ + c) * height_ + y, x); - else return src2_.REval((b * ch_ + c - ch_src1_) * height_ + y, x); + if (c < ch_src1_) return src1_.REval((b * ch_src1_ + c) * height_ + y, x); + else return src2_.REval((b * ch_src2_ + c - ch_src1_) * height_ + y, x); } private: Plan src1_; From c42ef27a3242ec8698858072b58def0bb7d46c40 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 4 Jan 2015 21:52:35 -0800 Subject: [PATCH 071/147] arbitary concat --- example/concat.cpp | 11 ++-- mshadow/extension/broadcast.h | 3 +- mshadow/extension/concat.h | 108 ++++++++++++++++++++++------------ 3 files changed, 78 insertions(+), 44 deletions(-) diff --git a/example/concat.cpp b/example/concat.cpp index 2a200386dd8c..16f5b77a0720 100644 --- a/example/concat.cpp +++ b/example/concat.cpp @@ -15,13 +15,16 @@ void Print2D(const Tensor&t) { int main() { Tensor t1 = NewTensor(Shape4(2, 2, 3,2), 0.1f); - Tensor t2 = NewTensor(Shape4(2, 3, 3,2), 0.2f); + Tensor t2 = NewTensor(Shape4(2, 2, 3,2), 0.2f); Tensor t3 = NewTensor(Shape4(2, 1, 3,2), 0.3f); - Tensor t = NewTensor(Shape4(2,6,3,2), 0.0f); - t = concat(t1, concat(t2, t3)); + Tensor t = NewTensor(Shape4(2,5,3,2), 0.0f); + Tensor tr = NewTensor(Shape4(2,2,3,4), 0.0f); + t = concat<1>(t1, concat<1>(t2, t3)); + tr = concat<3>(t1, t2); Print2D(t[0][2]); + Print2D(tr[0][2]); t += 1.0f; - concat(t1, concat(t2, t3)) = t; + concat<1>(t1, concat<1>(t2, t3)) = t; Print2D(t3[1][0]); FreeSpace(&t1); FreeSpace(&t2); diff --git a/mshadow/extension/broadcast.h b/mshadow/extension/broadcast.h index a163a5011212..9a8b57bffc7d 100644 --- 
a/mshadow/extension/broadcast.h +++ b/mshadow/extension/broadcast.h @@ -69,8 +69,7 @@ repmat(const expr::Exp &src, index_t nrow) { //---------------------- // Execution plan //---------------------- -template +template struct Plan, DType> { public: static const int dimcast = dimdst - dimdst_m_cast; diff --git a/mshadow/extension/concat.h b/mshadow/extension/concat.h index 6fc119126cfb..3aa1e3123182 100644 --- a/mshadow/extension/concat.h +++ b/mshadow/extension/concat.h @@ -11,34 +11,35 @@ namespace expr { * \tparam RhsExp right expression * \tparam DType the type of elements * \tparam srcdim dimension of src + * \tparam dimsrc_m_cat dimsrc - dimcat */ template + typename Device, typename DType, + int srcdim, int dimsrc_m_cat> struct ConcatExp : public TRValue, + Device, DType, + srcdim, dimsrc_m_cat>, Device, srcdim, DType> { + static const int dimcat = srcdim - dimsrc_m_cat; const LhsExp &src1_; const RhsExp &src2_; - index_t height_; - index_t width_; - index_t ch_src1_; - index_t ch_src2_; + index_t dcat_src1_; + index_t dcat_src2_; Shape<4> shape_; ConcatExp(const LhsExp &src1, const RhsExp &src2) : src1_(src1), src2_(src2) { Shape sshape1 = ShapeCheck::Check(src1_); Shape sshape2 = ShapeCheck::Check(src2_); - utils::Check(sshape1[srcdim - 2] == sshape2[srcdim - 2], - "ConcatExp: height requirement not met"); - utils::Check(sshape1[srcdim - 1] == sshape2[srcdim - 1], - "ConcatExp: width requirement not met"); - utils::Check(sshape1[0] == sshape2[0], - "ConcatExp: batch requirement not met"); + #pragma unroll + for (int i = 0; i < srcdim; ++i) { + if (i != dimcat) { + utils::Check(sshape1[i] == sshape2[i], + "ConcatExp: shape mismatch"); + } + } this->shape_ = sshape1; - this->shape_[1] = sshape1[1] + sshape2[1]; - this->ch_src1_ = sshape1[1]; - this->ch_src2_ = sshape2[1]; - this->height_ = sshape1[2]; - this->width_ = sshape1[3]; + this->shape_[dimcat] = sshape1[dimcat] + sshape2[dimcat]; + this->dcat_src1_ = sshape1[dimcat]; + this->dcat_src2_ = sshape2[dimcat]; } template inline void @@ -55,20 +56,21 @@ struct ConcatExp : public TRValue -inline ConcatExp::kDim> +inline ConcatExp concat(const TRValue &src1, const TRValue &src2) { - TypeCheckPass::kDim == 4> - ::Error_Expression_Does_Not_Meet_Dimension_Req(); - TypeCheckPass::kDim == 4> + TypeCheckPass::kDim == ExpInfo::kDim> ::Error_Expression_Does_Not_Meet_Dimension_Req(); - return ConcatExp::kDim> + TypeCheckPass::kDim == srcdim> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return ConcatExp (src1.self(), src2.self()); } //------------------------ @@ -76,16 +78,19 @@ concat(const TRValue &src1, //------------------------ // runtime shapecheck template -struct ShapeCheck >{ - inline static Shape Check(const ConcatExp &t) { + typename Device, typename DType, + int srcdim, int dimsrc_m_cat> +struct ShapeCheck >{ + inline static Shape Check(const ConcatExp &t) { return t.shape_; } }; template -struct StreamInfo >{ - inline static Stream *Get(const ConcatExp &t) { + typename Device, typename DType, + int srcdim, int dimsrc_m_cat> +struct StreamInfo >{ + inline static Stream * + Get(const ConcatExp &t) { Stream *lhs = StreamInfo::Get(t.src1_); Stream *rhs = StreamInfo::Get(t.src2_); if (lhs != rhs) return NULL; @@ -94,8 +99,9 @@ struct StreamInfo >{ }; // static typecheck template -struct ExpInfo >{ + typename Device, typename DType, + int srcdim, int dimsrc_m_cat> +struct ExpInfo >{ static const int kDimLhs = ExpInfo::kDim; static const int kDimRhs = ExpInfo::kDim; // copy from binarymap @@ -109,13 +115,15 @@ struct ExpInfo >{ 
// Execution plan //--------------------- template -struct Plan, DType> { + typename Device, typename DType, + int srcdim, int dimsrc_m_cat> +struct Plan, DType> { public: - explicit Plan(const ConcatExp &e) : - src1_(MakePlan(e.src1_)), src2_(MakePlan(e.src2_)), - height_(e.height_), width_(e.width_), - ch_src1_(e.ch_src1_), ch_src2_(e.ch_src2_), ch_(e.ch_src1_ + e.ch_src2_) {} + static const int dimcat = srcdim - dimsrc_m_cat; + explicit Plan(const ConcatExp &e) + : src1_(MakePlan(e.src1_)), src2_(MakePlan(e.src2_)), + height_(e.shape_.ProdShape(dimcat + 1, srcdim - 1)), + ch_src1_(e.dcat_src1_), ch_src2_(e.dcat_src2_), ch_(e.shape_[dimcat]) {} MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { const index_t y = i % height_; i /= height_; @@ -134,12 +142,36 @@ struct Plan, DType> { if (c < ch_src1_) return src1_.REval((b * ch_src1_ + c) * height_ + y, x); else return src2_.REval((b * ch_src2_ + c - ch_src1_) * height_ + y, x); } + private: Plan src1_; Plan src2_; - const index_t height_, width_, ch_src1_, ch_src2_, ch_; + const index_t height_, ch_src1_, ch_src2_, ch_; }; // struct Plan +// specialize for concat in x +template +struct Plan, DType> { + public: + explicit Plan(const ConcatExp &e) + : src1_(MakePlan(e.src1_)), src2_(MakePlan(e.src2_)), + width_src1_(e.dcat_src1_) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + if (x < width_src1_) return src1_.Eval(y, x); + else return src2_.Eval(y, x - width_src1_); + } + MSHADOW_XINLINE DType &REval(index_t y, index_t x) { + if (x < width_src1_) return src1_.REval(y, x); + else return src2_.REval(y, x - width_src1_); + } + + private: + Plan src1_; + Plan src2_; + const index_t width_src1_; +}; }// namespace expr } // namespace mshadow #endif // MSHADOW_EXTENSION_CONCAT_H_ From 3f4a2176f99ad897c31b345f4196a40ff007cc3d Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Mon, 5 Jan 2015 00:40:08 -0800 Subject: [PATCH 072/147] fix sign bug --- mshadow/extension/channel_pool.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mshadow/extension/channel_pool.h b/mshadow/extension/channel_pool.h index 973dbb49ab82..e4f7c720535d 100644 --- a/mshadow/extension/channel_pool.h +++ b/mshadow/extension/channel_pool.h @@ -41,8 +41,8 @@ struct ChannelPoolingExp: /*! * \brief channel pooling, do reduction over (local nearby) channels, * used to implement local response normalization - * \param src source data - * \param nsize neighbor size + * \param src source data + * \param nsize neighbor size * \return expression of pooled result * \tparam Reducer reducer type * \tparam SrcExp source expression @@ -86,7 +86,7 @@ struct Plan, DType> { const index_t c = i % channel_; const index_t n = i / channel_; const index_t x = j; - const index_t cstart = c * stride_ - pad_ < 0 ? 0 : c * stride_ - pad_; + const index_t cstart = static_cast(c * stride_ - pad_) < 0 ? 
0 : c * stride_ - pad_; const index_t cend = min(cstart + hnsize_, channel_); DType res; Reducer::SetInitValue(res); for (index_t cc = cstart; cc < cend; ++cc) { From bd413d022655bab43e3f394487c482ccfb2dd437 Mon Sep 17 00:00:00 2001 From: winsty Date: Sun, 4 Jan 2015 13:45:13 +0800 Subject: [PATCH 073/147] add stride and pad to channel pooling --- mshadow/extension/channel_pool.h | 32 +++++-- mshadow/extension/channel_unpool.h | 132 +++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+), 9 deletions(-) create mode 100644 mshadow/extension/channel_unpool.h diff --git a/mshadow/extension/channel_pool.h b/mshadow/extension/channel_pool.h index 0b4b3cbf8a2b..973dbb49ab82 100644 --- a/mshadow/extension/channel_pool.h +++ b/mshadow/extension/channel_pool.h @@ -26,11 +26,13 @@ struct ChannelPoolingExp: const SrcExp &src_; /*! \brief neighbor size */ index_t nsize_; + /*! \brief stride of pooling */ + index_t stride_; + /*! \brief pad of pooling of each side */ + index_t pad_; /*! \brief constructor */ - ChannelPoolingExp(const SrcExp &src, index_t nsize) - : src_(src), nsize_(nsize) { - utils::Check(nsize % 2 == 1, - "chpool: local size must be odd"); + ChannelPoolingExp(const SrcExp &src, index_t nsize, index_t stride, index_t pad) + : src_(src), nsize_(nsize), stride_(stride), pad_(pad) { this->shape_ = ShapeCheck::Check(src_); utils::Check(this->shape_[srcdim - 3] >= nsize_, "chpool: local size must be smaller than nchannels"); @@ -52,9 +54,21 @@ inline ChannelPoolingExp::kDim> chpool(const Exp &src, index_t nsize) { TypeCheckPass::kDim >= 3> ::Error_Expression_Does_Not_Meet_Dimension_Req(); + utils::Check(nsize % 2 == 1, + "chpool: if no pad is specified, local size must be odd"); return ChannelPoolingExp::kDim>(src.self(), nsize); + DType, ExpInfo::kDim>(src.self(), nsize, 1, nsize / 2); } + +template +inline ChannelPoolingExp::kDim> +chpool(const Exp &src, index_t nsize, index_t stride, index_t pad) { + TypeCheckPass::kDim >= 3> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return ChannelPoolingExp::kDim>(src.self(), nsize, stride, pad); +} + //---------------------- // Execution plan //---------------------- @@ -64,7 +78,7 @@ struct Plan, DType> { explicit Plan(const ChannelPoolingExp &e) : src_(MakePlan(e.src_)), channel_(e.shape_[srcdim - 3]), height_(e.shape_[srcdim - 2]), width_(e.shape_[srcdim - 1]), - hnsize_(e.nsize_ / 2) {} + hnsize_(e.nsize_), stride_(e.stride_), pad_(e.pad_){} MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { using namespace std; const index_t y = i % height_; @@ -72,8 +86,8 @@ struct Plan, DType> { const index_t c = i % channel_; const index_t n = i / channel_; const index_t x = j; - const index_t cstart = c < hnsize_ ? 0 : c - hnsize_; - const index_t cend = min(c + hnsize_ + 1, channel_); + const index_t cstart = c * stride_ - pad_ < 0 ? 0 : c * stride_ - pad_; + const index_t cend = min(cstart + hnsize_, channel_); DType res; Reducer::SetInitValue(res); for (index_t cc = cstart; cc < cend; ++cc) { Reducer::Reduce(res, src_.Eval((n * channel_ + cc) * height_ + y, x)); @@ -82,7 +96,7 @@ struct Plan, DType> { } private: Plan src_; - const index_t channel_, height_, width_, hnsize_; + const index_t channel_, height_, width_, hnsize_, stride_, pad_; }; } // namespace expr } // namespace mshadow diff --git a/mshadow/extension/channel_unpool.h b/mshadow/extension/channel_unpool.h new file mode 100644 index 000000000000..7b66b593476f --- /dev/null +++ b/mshadow/extension/channel_unpool.h @@ -0,0 +1,132 @@ +/*! 
+ * Copyright (c) 2014 by Contributors
+ * \file channel_unpool.h
+ * \brief support for ch_unpool, the reverse operation of channel pooling
+ * \author Tianqi Chen
+ */
+#ifndef MSHADOW_EXTENSION_CHANNEL_UNPOOL_H_
+#define MSHADOW_EXTENSION_CHANNEL_UNPOOL_H_
+#include <algorithm>
+#include "../extension.h"
+namespace mshadow {
+namespace expr {
+/*!
+ * \brief channel unpooling expression, undo reduction over (local nearby) channels,
+ *        used to backprop gradient through local response normalization
+ * \tparam Reducer reduction method used during pooling
+ * \tparam SrcExp source expression to be unpooled from
+ * \tparam DType the type of elements
+ * \tparam srcdim dimension of src
+ */
+template<typename Reducer, typename SrcExp, typename DType, int srcdim>
+struct ChannelUnpoolingExp:
+      public MakeTensorExp<ChannelUnpoolingExp<Reducer, SrcExp, DType, srcdim>,
+                           SrcExp, srcdim, DType> {
+  /*! \brief source input, corresponds to src in pooling */
+  const SrcExp &data_src_;
+  /*! \brief result of pooled data, corresponds to result of pooling */
+  const SrcExp &data_pooled_;
+  /*! \brief gradient data of pooled part, to be propagated down */
+  const SrcExp &grad_pooled_;
+  /*! \brief channel of pooled expression */
+  index_t pchannel_;
+  /*! \brief neighbor size of the pooling */
+  index_t nsize_;
+  /*! \brief stride of the pooling */
+  index_t kstride_;
+  /*! \brief pad of the pooling on each side */
+  index_t pad_;
+  /*! \brief constructor */
+  ChannelUnpoolingExp(const SrcExp &data_src,
+                      const SrcExp &data_pooled,
+                      const SrcExp &grad_pooled,
+                      index_t nsize, index_t kstride, index_t pad)
+      : data_src_(data_src), data_pooled_(data_pooled),
+        grad_pooled_(grad_pooled),
+        nsize_(nsize), kstride_(kstride), pad_(pad) {
+    Shape<srcdim> pshape = ShapeCheck<srcdim, SrcExp>::Check(grad_pooled);
+    utils::Check(pshape == ShapeCheck<srcdim, SrcExp>::Check(data_pooled),
+                 "ChannelUnpoolingExp: data and grad shape mismatch");
+    Shape<srcdim> sshape = ShapeCheck<srcdim, SrcExp>::Check(data_src);
+    for (int k = 0; k < srcdim; ++k) {
+      if (k == 1) {
+        continue;
+      }
+      utils::Check(pshape[k] == sshape[k],
+                   "ChannelUnpoolingExp: pooled tensor and src tensor shape mismatch");
+    }
+    pchannel_ = pshape[1];
+    this->shape_ = sshape;
+  }
+};
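For the gradient routing above, a source channel c receives contributions from exactly those pooled channels whose windows cover it; the cstart/cend indices in the Plan below compute that half-open range. A standalone sketch of the index arithmetic (hypothetical helper, not part of this file, written with the corrected pad_ handling that a later patch in this series applies):

    #include <algorithm>
    #include <cstdio>

    int main() {
      const int nsize = 5, stride = 1, pad = 2;  // an LRN-style setting
      const int src_channels = 8;
      // output channels of the forward chpool with this stride and pad
      const int pooled_channels = (src_channels - nsize + pad * 2 + 1) / stride;
      for (int c = 0; c < src_channels; ++c) {
        const int cstart =
            c < nsize - pad ? 0 : (c - (nsize - pad) + stride) / stride;
        const int cend = std::min((c + pad + stride) / stride, pooled_channels);
        std::printf("src channel %d <- pooled channels [%d, %d)\n",
                    c, cstart, cend);
      }
      return 0;
    }

+/*!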
+ * \brief channel unpooling, do unroll over (local nearby) channels + * \param src source data + * \param nsize neighbor size + * \param stride stride of the pooling + * \param pad number of padding at each side + * \return expression of pooled result + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline ChannelUnpoolingExp::kDim> +ch_unpool(const Exp &data_src, + const Exp &data_pooled, + const Exp &grad_pooled, + index_t nsize, index_t stride, index_t pad) { + TypeCheckPass::kDim >= 3> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return ChannelUnpoolingExp::kDim> + (data_src.self(), data_pooled.self(), grad_pooled.self(), nsize, stride, pad); +} + +template +inline ChannelUnpoolingExp::kDim> +ch_unpool(const Exp &data_src, + const Exp &data_pooled, + const Exp &grad_pooled, index_t nsize) { + return ch_unpool(data_src, data_pooled, grad_pooled, nsize, 1, nsize / 2); +} + + +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const ChannelUnpoolingExp &e) + : data_src_(e.data_src_), data_pooled_(e.data_pooled_), + grad_pooled_(e.grad_pooled_), channel_(e.shape_[srcdim - 3]), + height_(e.shape_[srcdim - 2]), pchannel_(e.pchannel_), + hnsize_(e.nsize_), stride_(e.stride_), pad_(e.pad_){} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + using namespace std; + const DType vsrc = data_src_.Eval(i, j); + const index_t y = i % height_; + i /= height_; + const index_t c = i % channel_; + const index_t n = i / channel_; + const index_t x = j; + const index_t cstart = c < hnsize_ - pad_ ? 0 + : (c - (hnsize_ - pad_) + stride_) / stride_; + const index_t cend = min((c + pad + stride_) / stride_, channel_); + DType val = static_cast(0); + for (index_t cc = cstart; cc < cend; ++cc) { + val += Reducer::PartialGrad(vsrc, + data_pooled_.Eval(n * pchannel_ + cc) * height_ + y, x) * + grad_pooled_.Eval(n * pchannel_ + cc) * height_ + y, x); + } + return res; + } + private: + Plan data_src_, data_pooled_, grad_pooled_; + const index_t channel_, height_, pchannel_, hnsize_, stride_, pad_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_CHANNEL_POOL_H_ + From e9b426e970f7f771fcc158bcb30c3d2dc0600057 Mon Sep 17 00:00:00 2001 From: winsty Date: Tue, 6 Jan 2015 02:34:43 +0800 Subject: [PATCH 074/147] check in channel pool/unpool. UNTESTED --- mshadow/extension.h | 1 + mshadow/extension/channel_pool.h | 12 ++++++++---- mshadow/extension/channel_unpool.h | 8 ++++---- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/mshadow/extension.h b/mshadow/extension.h index 865d8fe5bc5b..882d367a72dd 100644 --- a/mshadow/extension.h +++ b/mshadow/extension.h @@ -17,6 +17,7 @@ #include "./extension/spatial_pool.h" #include "./extension/spatial_unpool.h" #include "./extension/channel_pool.h" +#include "./extension/channel_unpool.h" #include "./extension/pad.h" #include "./extension/crop.h" #include "./extension/mirror.h" diff --git a/mshadow/extension/channel_pool.h b/mshadow/extension/channel_pool.h index e4f7c720535d..4039d1d6303a 100644 --- a/mshadow/extension/channel_pool.h +++ b/mshadow/extension/channel_pool.h @@ -30,12 +30,15 @@ struct ChannelPoolingExp: index_t stride_; /*! \brief pad of pooling of each side */ index_t pad_; + index_t src_channel_; /*! 
\brief constructor */ ChannelPoolingExp(const SrcExp &src, index_t nsize, index_t stride, index_t pad) : src_(src), nsize_(nsize), stride_(stride), pad_(pad) { this->shape_ = ShapeCheck::Check(src_); + this->src_channel_ = this->shape_[srcdim - 3]; utils::Check(this->shape_[srcdim - 3] >= nsize_, "chpool: local size must be smaller than nchannels"); + this->shape_[srcdim - 3] = (this->src_channel_ - nsize + pad * 2 + 1) / stride; } }; /*! @@ -78,7 +81,8 @@ struct Plan, DType> { explicit Plan(const ChannelPoolingExp &e) : src_(MakePlan(e.src_)), channel_(e.shape_[srcdim - 3]), height_(e.shape_[srcdim - 2]), width_(e.shape_[srcdim - 1]), - hnsize_(e.nsize_), stride_(e.stride_), pad_(e.pad_){} + hnsize_(e.nsize_), stride_(e.stride_), pad_(e.pad_), + src_channel_(e.src_channel_){} MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { using namespace std; const index_t y = i % height_; @@ -86,17 +90,17 @@ struct Plan, DType> { const index_t c = i % channel_; const index_t n = i / channel_; const index_t x = j; - const index_t cstart = static_cast(c * stride_ - pad_) < 0 ? 0 : c * stride_ - pad_; + const index_t cstart = c * stride_ < pad_ ? 0 : c * stride_ - pad_; const index_t cend = min(cstart + hnsize_, channel_); DType res; Reducer::SetInitValue(res); for (index_t cc = cstart; cc < cend; ++cc) { - Reducer::Reduce(res, src_.Eval((n * channel_ + cc) * height_ + y, x)); + Reducer::Reduce(res, src_.Eval((n * src_channel_ + cc) * height_ + y, x)); } return res; } private: Plan src_; - const index_t channel_, height_, width_, hnsize_, stride_, pad_; + const index_t channel_, height_, width_, hnsize_, stride_, pad_, src_channel_; }; } // namespace expr } // namespace mshadow diff --git a/mshadow/extension/channel_unpool.h b/mshadow/extension/channel_unpool.h index 7b66b593476f..8646e8efbe33 100644 --- a/mshadow/extension/channel_unpool.h +++ b/mshadow/extension/channel_unpool.h @@ -113,14 +113,14 @@ struct Plan, DType> { const index_t x = j; const index_t cstart = c < hnsize_ - pad_ ? 0 : (c - (hnsize_ - pad_) + stride_) / stride_; - const index_t cend = min((c + pad + stride_) / stride_, channel_); + const index_t cend = min((c + pad_ + stride_) / stride_, channel_); DType val = static_cast(0); for (index_t cc = cstart; cc < cend; ++cc) { val += Reducer::PartialGrad(vsrc, - data_pooled_.Eval(n * pchannel_ + cc) * height_ + y, x) * - grad_pooled_.Eval(n * pchannel_ + cc) * height_ + y, x); + data_pooled_.Eval((n * pchannel_ + cc) * height_ + y, x) * + grad_pooled_.Eval((n * pchannel_ + cc) * height_ + y, x) ); } - return res; + return val; } private: Plan data_src_, data_pooled_, grad_pooled_; From 1ac42b0d41bf720b4cb62768ef0f64204defaf06 Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 5 Jan 2015 16:39:49 -0800 Subject: [PATCH 075/147] add stream NULL --- mshadow/tensor.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mshadow/tensor.h b/mshadow/tensor.h index 841a1d7d34a2..2f032ba66489 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -261,9 +261,10 @@ struct Tensor: public TRValue, // functions //-------------------------------- /*! \brief default constructor */ - MSHADOW_XINLINE Tensor(void) {} + MSHADOW_XINLINE Tensor(void) : stream_(NULL) {} /*! \brief constructor from shape */ - MSHADOW_XINLINE Tensor(const Shape &shape) : shape_(shape) {} + MSHADOW_XINLINE Tensor(const Shape &shape) + : shape_(shape), stream_(NULL) {} /*! 
\brief constructor from data pointer and shape, without stride */ MSHADOW_XINLINE Tensor(DType *dptr, const Shape &shape) : dptr_(dptr), shape_(shape), stride_(shape[kSubdim]), stream_(NULL) {} @@ -359,8 +360,9 @@ struct Tensor: index_t stride_; Stream *stream_; // constructor - MSHADOW_XINLINE Tensor(void) {} - MSHADOW_XINLINE Tensor(const Shape<1> &shape): shape_(shape) {} + MSHADOW_XINLINE Tensor(void) : stream_(NULL) {} + MSHADOW_XINLINE Tensor(const Shape<1> &shape) + : shape_(shape), stream(NULL) {} MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape) : dptr_(dptr), shape_(shape), stride_(shape[0]), stream_(NULL) {} MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape, index_t stride) From 0ecee9fb43771888d31cef1ee51706344df20a84 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Mon, 5 Jan 2015 16:47:56 -0800 Subject: [PATCH 076/147] typo --- mshadow/tensor.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mshadow/tensor.h b/mshadow/tensor.h index 2f032ba66489..8eacb8f6f290 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -5,7 +5,7 @@ * This lib requires explicit memory allocation and de-allocation * all the data structure Tensor, Tensor are like handles(pointers), * no memory allocation is happening during calculation - * + * * For STL style tensor, see tensor_container.h * \author Bing Xu, Tianqi Chen */ @@ -206,7 +206,7 @@ struct Stream { /*! * \brief query whether the the stream is idle * \return true if the stream is idle and all the job have been completed - */ + */ inline bool CheckIdle(void) { return true; } @@ -218,7 +218,7 @@ struct Stream { * \tparam dimension dimension of the tensor * \tparam DType the type of elements in the tensor */ -template +template struct TRValue: public expr::RValueExp { }; // more compact template @@ -252,8 +252,8 @@ struct Tensor: public TRValue, * this is used to deal with pitch allocation in gpu or sse(align x dimension to 64bit) for efficiency */ index_t stride_; - /*! - * \brief stream where the computation lies + /*! + * \brief stream where the computation lies * stream is a device dependency concept where each computation */ Stream *stream_; @@ -263,7 +263,7 @@ struct Tensor: public TRValue, /*! \brief default constructor */ MSHADOW_XINLINE Tensor(void) : stream_(NULL) {} /*! \brief constructor from shape */ - MSHADOW_XINLINE Tensor(const Shape &shape) + MSHADOW_XINLINE Tensor(const Shape &shape) : shape_(shape), stream_(NULL) {} /*! \brief constructor from data pointer and shape, without stride */ MSHADOW_XINLINE Tensor(DType *dptr, const Shape &shape) @@ -273,7 +273,7 @@ struct Tensor: public TRValue, const Shape &shape, index_t stride) : dptr_(dptr), shape_(shape), stride_(stride), stream_(NULL) {} /*! - * \return memory cost of the tensor, including the aligned x dimension + * \return memory cost of the tensor, including the aligned x dimension * \tparam startdim the starting dimension */ template @@ -286,7 +286,7 @@ struct Tensor: public TRValue, return memsz; } /*! 
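 */

As the MemSize context above shows, the x dimension is counted through stride_, so a pitch-padded row reserves a full stride of elements rather than shape_[dim - 1]. A quick worked example (plain arithmetic, not mshadow code):

    #include <cstdio>

    int main() {
      // a hypothetical 4 x 3 tensor whose rows are padded to stride 8
      const int shape0 = 4, shape1 = 3, stride = 8;
      const int msize = stride * shape0;  // MemSize<0> for a 2D tensor
      std::printf("reserved %d elements (vs %d unpadded)\n",
                  msize, shape0 * shape1);
      return 0;
    }

 /*!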
- * \return memory cost of the tensor, including the aligned x dimension + * \return memory cost of the tensor, including the aligned x dimension */ MSHADOW_XINLINE size_t MSize(void) const { return this->MemSize<0>(); @@ -362,7 +362,7 @@ struct Tensor: // constructor MSHADOW_XINLINE Tensor(void) : stream_(NULL) {} MSHADOW_XINLINE Tensor(const Shape<1> &shape) - : shape_(shape), stream(NULL) {} + : shape_(shape), stream_(NULL) {} MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape) : dptr_(dptr), shape_(shape), stride_(shape[0]), stream_(NULL) {} MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape, index_t stride) From 649f50349a424d61f96ed78087f5d68a2c752981 Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 5 Jan 2015 18:19:12 -0800 Subject: [PATCH 077/147] checkin ps interface --- mshadow-ps/.gitignore | 3 + mshadow-ps/Makefile | 33 ++++++++ mshadow-ps/README.md | 1 + mshadow-ps/ps.h | 111 ++++++++++++++++++++++++ mshadow-ps/ps_mthread-inl.h | 13 +++ mshadow-ps/thread.h | 164 ++++++++++++++++++++++++++++++++++++ mshadow/tensor.h | 2 +- 7 files changed, 326 insertions(+), 1 deletion(-) create mode 100644 mshadow-ps/.gitignore create mode 100644 mshadow-ps/Makefile create mode 100644 mshadow-ps/README.md create mode 100644 mshadow-ps/ps.h create mode 100644 mshadow-ps/ps_mthread-inl.h create mode 100644 mshadow-ps/thread.h diff --git a/mshadow-ps/.gitignore b/mshadow-ps/.gitignore new file mode 100644 index 000000000000..076c1aa82e8b --- /dev/null +++ b/mshadow-ps/.gitignore @@ -0,0 +1,3 @@ +Makefile +test +test.cpp diff --git a/mshadow-ps/Makefile b/mshadow-ps/Makefile new file mode 100644 index 000000000000..375c915cc933 --- /dev/null +++ b/mshadow-ps/Makefile @@ -0,0 +1,33 @@ +# set LD_LIBRARY_PATH +export CC = gcc +export CXX = clang++ +export NVCC =nvcc +export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -DMSHADOW_STAND_ALONE=1 +export LDFLAGS= -lm +export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) + +# specify tensor path +BIN = test +OBJ = +CUOBJ = +CUBIN = +.PHONY: clean all + +all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ) + +test: test.cpp + +$(BIN) : + $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) + +$(OBJ) : + $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) + +$(CUOBJ) : + $(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $(filter %.cu, $^) + +$(CUBIN) : + $(NVCC) -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -Xlinker "$(LDFLAGS)" $(filter %.cu %.cpp %.o, $^) + +clean: + $(RM) $(OBJ) $(BIN) $(CUBIN) $(CUOBJ) *~ diff --git a/mshadow-ps/README.md b/mshadow-ps/README.md new file mode 100644 index 000000000000..e92ec6b95e58 --- /dev/null +++ b/mshadow-ps/README.md @@ -0,0 +1 @@ +This folder contains parameter server abstraction for mshadow Tensor. diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h new file mode 100644 index 000000000000..88210d7fe9f8 --- /dev/null +++ b/mshadow-ps/ps.h @@ -0,0 +1,111 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file ps.h + * \brief parameter server abstraction for mshadow tensor + * this is a plugin of mshadow that can be used to syncrhonize + * parameters across device and machines + * + * \author Tianqi Chen, Mu Li + */ +#ifndef MSHADOW_PS_H_ +#define MSHADOW_PS_H_ +#include "../mshadow/tensor.h" + +namespace mshadow { +namespace ps { +/*! + * \brief interface of parameter server + * \tparam xpu the device of the data lies + * \tparam DType the type of element in the tensor + */ +template +class IParamServer { + /*! \brief virtual destructor */ + virtual ~IParamServer(void) {} + /*! 
+ * \brief Set param for the layer from string + * \param name parameter name + * \param val string for configuration + */ + virtual void SetParam(const char *name, const char *val) {} + /*! + * \brief initialize the paramerver server client + * \param num_device number of parallel device + * we want to support in current process + * in the future, the device id must be in [0, num_device) + */ + virtual void Init(int num_device = 1) {} + /*! + * \brief wait until the pull event finishes + * + * \param devid the device id this tensor lies in + * \param key the unique key to indicate the tensor + * this is unique per device + * \param data the data + */ + virtual void PullWait(int devid, int key) = 0; + /*! + * \brief push out a tensor to parameter server + * this call is asynchronize and returns immediately + * + * \param data the data + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + */ + template + inline void Push(mshadow::Tensor data, + int key, int devid = 0) { + this->Push_(data.FlatTo2D(), key, devid); + } + /*! + * \brief send a pull request, to pull parameter into data + * this call is asynchronize and returns immediately + * use PullWait to wait the event of copy finish + * + * \param data the data + * \param key the unique key to indicate the tensor, + * this is unique per device + * \param devid the device id this tensor lies in + */ + template + inline void PullReq(mshadow::Tensor data, + int key, int devid = 0) { + this->PullReq_(data, key, devid); + } + + protected: + /*! + * \brief push out a tensor to parameter server + * this call is asynchronize and returns immediately + * + * \param data the data + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + */ + virtual void Push_(mshadow::Tensor data, + int key, int devid = 0) = 0; + /*! + * \brief send a pull request, to pull parameter into data + * this call is asynchronize and returns immediately + * use PullWait to wait the event of copy finish + * + * \param data the data + * \param key the unique key to indicate the tensor, + * this is unique per device + * \param devid the device id this tensor lies in + */ + virtual void PullReq_(mshadow::Tensor data, + int key, int devid = 0) = 0; +}; +/*! + * \brief create a parameter server implementation + * \param type the type of paramerver server + */ +template +inline IParamServer *Create(const char *type); +} // namespace ps +} // namespace mshadow +#endif diff --git a/mshadow-ps/ps_mthread-inl.h b/mshadow-ps/ps_mthread-inl.h new file mode 100644 index 000000000000..9e0ee1124c64 --- /dev/null +++ b/mshadow-ps/ps_mthread-inl.h @@ -0,0 +1,13 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file ps_mthread-inl.h + * \brief multi-threading implementation of PS abstraction + * + * \author Tianqi Chen + */ +#ifndef MSHADOW_PS_MTHREAD_INL_H_ +#define MSHADOW_PS_MTHREAD_INL_H_ +#include "./thread.h" +#include "./ps.h" + +#endif // MSHADOW_PS_MTHREAD_INL_H_ diff --git a/mshadow-ps/thread.h b/mshadow-ps/thread.h new file mode 100644 index 000000000000..67bf2a6fc92a --- /dev/null +++ b/mshadow-ps/thread.h @@ -0,0 +1,164 @@ +#ifndef MSHADOW_UTILS_THREAD_H_ +#define MSHADOW_UTILS_THREAD_H_ +/*! 
+ * \file thread.h + * \brief this header include the minimum necessary resource for multi-threading that can be compiled in windows, linux, mac + * \author Tianqi Chen + */ +#ifdef _MSC_VER +#include "../mshadow/utils.h" +#include +#include +namespace mshadow { +namespace utils { +/*! \brief simple semaphore used for synchronization */ +class Semaphore { + public : + inline void Init(int init_val) { + sem = CreateSemaphore(NULL, init_val, 10, NULL); + utils::Check(sem != NULL, "create Semaphore error"); + } + inline void Destroy(void) { + CloseHandle(sem); + } + inline void Wait(void) { + utils::Check(WaitForSingleObject(sem, INFINITE) == WAIT_OBJECT_0, "WaitForSingleObject error"); + } + inline void Post(void) { + utils::Check(ReleaseSemaphore(sem, 1, NULL) != 0, "ReleaseSemaphore error"); + } + private: + HANDLE sem; +}; +/*! \brief simple thread that wraps windows thread */ +class Thread { + private: + HANDLE thread_handle; + unsigned thread_id; + public: + inline void Start(unsigned int __stdcall entry(void*), void *param) { + thread_handle = (HANDLE)_beginthreadex(NULL, 0, entry, param, 0, &thread_id); + } + inline int Join(void) { + WaitForSingleObject(thread_handle, INFINITE); + return 0; + } +}; +/*! \brief exit function called from thread */ +inline void ThreadExit(void *status) { + _endthreadex(0); +} +#define MSHADOW_THREAD_PREFIX unsigned int __stdcall +} // namespace utils +} // namespace mshadow +#else +// thread interface using g++ +#include +#include +namespace mshadow { +namespace utils { +/*!\brief semaphore class */ +class Semaphore { + #ifdef __APPLE__ + private: + sem_t* semPtr; + char sema_name[20]; + private: + inline void GenRandomString(char *s, const int len) { + static const char alphanum[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" ; + for (int i = 0; i < len; ++i) { + s[i] = alphanum[rand() % (sizeof(alphanum) - 1)]; + } + s[len] = 0; + } + public: + inline void Init(int init_val) { + sema_name[0]='/'; + sema_name[1]='s'; + sema_name[2]='e'; + sema_name[3]='/'; + GenRandomString(&sema_name[4], 16); + if((semPtr = sem_open(sema_name, O_CREAT, 0644, init_val)) == SEM_FAILED) { + perror("sem_open"); + exit(1); + } + utils::Check(semPtr != NULL, "create Semaphore error"); + } + inline void Destroy(void) { + if (sem_close(semPtr) == -1) { + perror("sem_close"); + exit(EXIT_FAILURE); + } + if (sem_unlink(sema_name) == -1) { + perror("sem_unlink"); + exit(EXIT_FAILURE); + } + } + inline void Wait(void) { + sem_wait(semPtr); + } + inline void Post(void) { + sem_post(semPtr); + } + #else + private: + sem_t sem; + public: + inline void Init(int init_val) { + sem_init(&sem, 0, init_val); + } + inline void Destroy(void) { + sem_destroy(&sem); + } + inline void Wait(void) { + sem_wait(&sem); + } + inline void Post(void) { + sem_post(&sem); + } + #endif +}; + +// mutex that works with pthread +class Mutex { + public: + inline void Init(void) { + pthread_mutex_init(&mutex, NULL); + } + inline void Lock(void) { + pthread_mutex_lock(&mutex); + } + inline void Unlock(void) { + pthread_mutex_unlock(&mutex); + } + inline void Destroy(void) { + pthread_mutex_destroy(&mutex); + } + private: + pthread_mutex_t mutex; +}; + +/*!\brief simple thread class */ +class Thread { + private: + pthread_t thread; + public : + inline void Start(void * entry(void*), void *param) { + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + pthread_create(&thread, &attr, entry, param); + } + inline int Join(void) { + void *status; + return 
pthread_join(thread, &status);
+  }
+};
+inline void ThreadExit(void *status) {
+  pthread_exit(status);
+}
+} // namespace utils
+} // namespace mshadow
+#define MSHADOW_THREAD_PREFIX void *
+#endif // Linux
+#endif // MSHADOW_UTILS_THREAD_H_
diff --git a/mshadow/tensor.h b/mshadow/tensor.h
index 2f032ba66489..ef354f76a8b6 100644
--- a/mshadow/tensor.h
+++ b/mshadow/tensor.h
@@ -362,7 +362,7 @@ struct Tensor:
   // constructor
   MSHADOW_XINLINE Tensor(void) : stream_(NULL) {}
   MSHADOW_XINLINE Tensor(const Shape<1> &shape)
-      : shape_(shape), stream(NULL) {}
+      : shape_(shape), stream_(NULL) {}
   MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape)
       : dptr_(dptr), shape_(shape), stride_(shape[0]), stream_(NULL) {}
   MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape, index_t stride)

From 2a6c2eb1d546677ae6aed6288a458611dd3d0a77 Mon Sep 17 00:00:00 2001
From: winsty
Date: Wed, 7 Jan 2015 04:07:56 +0800
Subject: [PATCH 078/147] add new example for stream

---
 example/Makefile        |  8 ++++----
 example/basic_stream.cu | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 4 deletions(-)
 create mode 100644 example/basic_stream.cu

diff --git a/example/Makefile b/example/Makefile
index 37c300598dea..931b2fc39427 100644
--- a/example/Makefile
+++ b/example/Makefile
@@ -1,9 +1,9 @@
 # set LD_LIBRARY_PATH
 export CC  = gcc
-export CXX = clang++
+export CXX = g++
 export NVCC =nvcc
-export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -DMSHADOW_STAND_ALONE=1
-export LDFLAGS= -lm
+export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -I/opt/intel/mkl/include -I/usr/local/cuda-6.0/include/ -L/opt/intel/mkl/lib/intel64 -L/opt/intel/lib/intel64 -L/usr/local/cuda-6.0/lib64
+export LDFLAGS= -lm -lcurand -lcublas
 export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX)

 # specify tensor path
@@ -18,7 +18,7 @@ all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ)
 basic: basic.cpp
 concat: concat.cpp
 defop: defop.cpp
-basicx: basic.cu
+basicx: basic_gpu.cu

 $(BIN) :
	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS)
diff --git a/example/basic_stream.cu b/example/basic_stream.cu
new file mode 100644
index 000000000000..5687bdd5aae0
--- /dev/null
+++ b/example/basic_stream.cu
@@ -0,0 +1,34 @@
+// header file to use mshadow
+#include "mshadow/tensor.h"
+// this namespace contains all data structures, functions
+using namespace mshadow;
+// this namespace contains all operator overloads
+using namespace mshadow::expr;
+
+int main(void) {
+
+  // initialize tensor engine before using tensor operation, needed for CuBLAS
+  InitTensorEngine<gpu>();
+  // create two 2 x 5 tensors on device, filled with 0
+  Tensor<gpu, 2, float> ts1 = NewTensor<gpu>(Shape2(2, 5), 0.0f);
+  Tensor<gpu, 2, float> ts2 = NewTensor<gpu>(Shape2(2, 5), 0.0f);
+  ts1.stream_ = NewStream<gpu>();
+  ts2.stream_ = NewStream<gpu>();
+  ts1 = 1; // Should use stream 0.
+  ts2 = 2; // Should use stream 1. Can run in parallel with stream 0.
+  Tensor<gpu, 2, float> res = NewTensor<gpu>(Shape2(2, 2), 0.0f);
+  res.stream_ = NewStream<gpu>();
+  res = dot(ts1, ts2.T()); // Should use stream 2.
+
+  Tensor<cpu, 2, float> cpu_res = NewTensor<cpu>(Shape2(2, 2), 0.0f);
+  Copy(cpu_res, res); // default stream, should be 0.
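  // (editor's note: the dot above is issued on stream 2, but nothing orders
  //  it after the writes to ts1/ts2 on streams 0 and 1, so the example is
  //  relying on timing; a safer sketch would block first, using the Wait()
  //  member that Stream<gpu> provides:
  //    ts1.stream_->Wait();
  //    ts2.stream_->Wait();
  //    res = dot(ts1, ts2.T());
  //  the Copy above runs on the legacy default stream, which does wait for
  //  pending work in the other streams before it executes.)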
+ for (index_t i = 0; i < cpu_res.size(0); ++i){ + for (index_t j = 0; j < cpu_res.size(1); ++j){ + printf("%.2f ", cpu_res[i][j]); + } + printf("\n"); + } + // shutdown tensor enigne after usage + ShutdownTensorEngine(); + return 0; +} From d3a03fe8ca3ebbdcba401cca4b8c18053296d4b5 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 7 Jan 2015 04:29:01 -0800 Subject: [PATCH 079/147] ps in shape --- mshadow-ps/Makefile | 4 +- mshadow-ps/ps.h | 116 ++++++++++++--- mshadow-ps/ps_local-inl.h | 289 ++++++++++++++++++++++++++++++++++++ mshadow-ps/ps_mthread-inl.h | 13 -- mshadow-ps/thread.h | 6 +- mshadow-ps/thread_util.h | 141 ++++++++++++++++++ mshadow/tensor.h | 6 + mshadow/tensor_cpu-inl.h | 3 + mshadow/tensor_gpu-inl.h | 5 +- mshadow/utils.h | 2 +- 10 files changed, 544 insertions(+), 41 deletions(-) create mode 100644 mshadow-ps/ps_local-inl.h delete mode 100644 mshadow-ps/ps_mthread-inl.h create mode 100644 mshadow-ps/thread_util.h diff --git a/mshadow-ps/Makefile b/mshadow-ps/Makefile index 375c915cc933..ed2d466ff490 100644 --- a/mshadow-ps/Makefile +++ b/mshadow-ps/Makefile @@ -3,7 +3,7 @@ export CC = gcc export CXX = clang++ export NVCC =nvcc export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -DMSHADOW_STAND_ALONE=1 -export LDFLAGS= -lm +export LDFLAGS= -lm -lpthread export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) # specify tensor path @@ -15,7 +15,7 @@ CUBIN = all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ) -test: test.cpp +test: test.cpp *.h $(BIN) : $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h index 88210d7fe9f8..de711ee0aa07 100644 --- a/mshadow-ps/ps.h +++ b/mshadow-ps/ps.h @@ -9,6 +9,11 @@ */ #ifndef MSHADOW_PS_H_ #define MSHADOW_PS_H_ +#include +// optionally support of lambda function in C++11, if available +#if __cplusplus >= 201103L +#include +#endif // C++11 #include "../mshadow/tensor.h" namespace mshadow { @@ -21,6 +26,15 @@ namespace ps { template class IParamServer { + public: + /*! + * \brief callback function that will be executed when pull request finishes + * before calling the callback, the thread context is already switched + * to the device of pullrequest + * \param stream the stream of callback thread, it is recommended to operate using this stream + * \param arg the argument of callback function + */ + typedef void (CallbackFunction) (Stream *stream, void *arg); /*! \brief virtual destructor */ virtual ~IParamServer(void) {} /*! @@ -31,20 +45,26 @@ class IParamServer { virtual void SetParam(const char *name, const char *val) {} /*! * \brief initialize the paramerver server client - * \param num_device number of parallel device - * we want to support in current process - * in the future, the device id must be in [0, num_device) + * \param devices specifies the possible device id + * to be input from Push and Pull, */ - virtual void Init(int num_device = 1) {} + virtual void Init(const std::vector &devices) {} + /*! + * \brief initialize the paramerver server client + * without specifying the devices, only device 0 is allowed + */ + inline void Init(void) { + std::vector dev; + dev.push_back(0); + this->Init(dev); + } /*! * \brief wait until the pull event finishes - * - * \param devid the device id this tensor lies in * \param key the unique key to indicate the tensor * this is unique per device - * \param data the data + * \param devid the device id this tensor lies in */ - virtual void PullWait(int devid, int key) = 0; + virtual void PullWait(int key, int devid = 0) = 0; /*! 
* \brief push out a tensor to parameter server * this call is asynchronize and returns immediately @@ -53,11 +73,15 @@ class IParamServer { * \param key the unique key to indicate the tensor * this is unique per device * \param devid the device id this tensor lies in + * \param priority the priority of this operation, + * the bigger the number is the higher the priority will be */ template - inline void Push(mshadow::Tensor data, - int key, int devid = 0) { - this->Push_(data.FlatTo2D(), key, devid); + inline void Push(Tensor data, + int key, + int devid = 0, + int priority = 0) { + this->Push_(data.FlatTo2D(), key, devid, priority); } /*! * \brief send a pull request, to pull parameter into data @@ -68,13 +92,32 @@ class IParamServer { * \param key the unique key to indicate the tensor, * this is unique per device * \param devid the device id this tensor lies in + * \param priority the priority of this operation, + * the bigger the number is the higher the priority will be + * \param callback the callback function that will + * be invoked when the request finishes + * \param callback_arg the argument to pass to callback */ template - inline void PullReq(mshadow::Tensor data, - int key, int devid = 0) { - this->PullReq_(data, key, devid); + inline void PullReq(Tensor data, + int key, + int devid = 0, + int priority = 0, + CallbackFunction callback = NULL, + void *callback_arg = NULL) { + this->PullReq_(data.FlatTo2D(), key, + devid, priority, callback); } - +#if __cplusplus >= 201103L + template + inline void PullReq(Tensor data, + int key, + int devid, + int priority, + std::function *stream)> callback) { + this->PullReq(data, key, devid, priority, InvokeLambda_, &callback); + } +#endif // C++11 protected: /*! * \brief push out a tensor to parameter server @@ -84,9 +127,13 @@ class IParamServer { * \param key the unique key to indicate the tensor * this is unique per device * \param devid the device id this tensor lies in + * \param priority the priority of this operation, + * the bigger the number is the higher the priority will be */ - virtual void Push_(mshadow::Tensor data, - int key, int devid = 0) = 0; + virtual void Push_(Tensor data, + int key, + int devid = 0, + int priority = 0) = 0; /*! * \brief send a pull request, to pull parameter into data * this call is asynchronize and returns immediately @@ -96,16 +143,43 @@ class IParamServer { * \param key the unique key to indicate the tensor, * this is unique per device * \param devid the device id this tensor lies in + * \param priority the priority of this operation, + * the bigger the number is the higher the priority will be + * \param callback the callback function that will + * be invoked when the request finishes + * \param callback_arg the argument to pass to callback */ - virtual void PullReq_(mshadow::Tensor data, - int key, int devid = 0) = 0; + virtual void PullReq_(Tensor data, + int key, + int devid, + int priority, + CallbackFunction callback, + void *callback_arg) = 0; + private: +// C++11 support for lambda prepare function +#if __cplusplus >= 201103L + /*! \brief hack function to convert lambda to callback function */ + inline void InvokeLambda_(Stream *stream, void *fun) { + (*static_cast *stream)>*>(fun))(stream); + } +#endif // C++11 }; -/*! +} // namespace ps +} // namespace mshadow + +#include "./ps_local-inl.h" +namespace mshadow { +namespace ps { +/*! 
* \brief create a parameter server implementation
 * \param type the type of parameter server
 */
template<typename xpu, typename DType>
inline IParamServer<xpu, DType> *Create(const char *type) {
  if (!strcmp("local", type)) return new LocalServer<xpu, DType>();
  utils::Error("unknown server type %s\n", type);
  return NULL;
}
} // namespace ps
} // namespace mshadow
#endif
diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h
new file mode 100644
index 000000000000..1f84e24be025
--- /dev/null
+++ b/mshadow-ps/ps_local-inl.h
@@ -0,0 +1,289 @@
+/*!
+ * Copyright (c) 2014 by Contributors
+ * \file ps_local-inl.h
+ * \brief local multi-threading implementation of PS abstraction
+ *
+ * \author Tianqi Chen
+ */
+#ifndef MSHADOW_PS_LOCAL_INL_H_
+#define MSHADOW_PS_LOCAL_INL_H_
+#include <map>
+#include <utility>
+#include "./thread.h"
+#include "./thread_util.h"
+#include "./ps.h"
+
+namespace mshadow {
+namespace ps {
+// multi-threaded implementation of the PS interface
+template<typename xpu, typename DType>
+class LocalServer : public IParamServer<xpu, DType> {
+ public:
+  // redefine callback function
+  typedef typename IParamServer<xpu, DType>::CallbackFunction
+  CallbackFunction;
+  // destructor
+  virtual ~LocalServer(void) {
+    destroy_signal = true;
+    push_queue.Abort(1);
+    pull_queue.Abort(1);
+    thread_push_handler.Join();
+    thread_pull_handler.Join();
+    push_queue.Destroy();
+    pull_map.Destroy();
+  }
+  virtual void PullWait(int key, int devid) {
+
+  }
+  virtual void Init(const std::vector<int> &devices) {
+    utils::Check(devices.size() != 0,
+                 "LocalServer.Init: must contain at least 1 device");
+    push_queue.Init();
+    this->devices = devices;
+    // initialize device id to local index
+    dev2index.clear();
+    for (size_t i = 0; i < devices.size(); ++i) {
+      int devid = devices[i];
+      utils::Assert(devid >= 0, "device id must be non-negative");
+      if (devid >= static_cast<int>(dev2index.size())) {
+        dev2index.resize(devid + 1, -1);
+      }
+      dev2index[devid] = static_cast<int>(i);
+    }
+    // initialize the thread
+    thread_push_handler.Start(PushHandlerThread, this);
+  }
+ protected:
+  virtual void Push_(Tensor<xpu, 2, DType> data,
+                     int key, int devid, int priority) {
+    this->InitPullMap(key);
+    push_queue.Push(PullTask(data, key, devid), priority);
+  }
+  virtual void PullReq_(Tensor<xpu, 2, DType> data,
+                        int key, int devid, int priority,
+                        CallbackFunction callback,
+                        void *callback_arg) {
+    PullEntry &e = pull_map.GetRef(key);
+    utils::Assert(e.req.size() == devices.size(),
+                  "must initialize the key");
+    const int wid = GetWorkIndex(devid);
+    PullReqRecord &r = e.req[wid];
+    r.dest = data;
+    r.priority = priority;
+    r.callback = callback;
+    r.callback_arg = callback_arg;
+    request_lock.Lock();
+    utils::Check(!r.pending,
+                 "cannot send duplicate pull request before it finishes");
+    if (e.ready) {
+      pull_queue.Push(std::make_pair(key, devid));
+    } else {
+      r.pending = true;
+    }
+    request_lock.Unlock();
+  }
+  /*!
+   * \brief called to notify that the data is ready for pull
+   * \param data the data that can be pulled back
+   * \param key the key of the data
+   */
+  virtual void PullReady(Tensor<cpu, 2, DType> data, int key) {
+    PullEntry &e = pull_map.GetRef(key);
+    utils::Assert(e.req.size() == devices.size(),
+                  "must initialize the key");
+    request_lock.Lock();
+    e.ready = true;
+    for (int i = 0; i < e.req.size(); ++i) {
+      if (e.req[i].pending) {
+        pull_queue.Push(std::make_pair(key, devices[i]));
+        e.req[i].pending = false;
+      }
+    }
+    request_lock.Unlock();
+  }
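(editor's aside, not part of the patch: how the pieces above are meant to fit
together. A hedged sketch with hypothetical tensors, one pusher per device;
note PullWait is still a stub at this point in the series, the blocking
behaviour is filled in by a later patch:

  LocalServer<cpu, float> server;
  std::vector<int> devs;
  devs.push_back(0);
  devs.push_back(1);
  server.Init(devs);
  // each device thread d then does:
  //   server.Push(grad_d, key, d);     // buffered until all devices push
  //   server.PullReq(out_d, key, d);   // served once the result is ready
  //   server.PullWait(key, d);         // block until out_d is filled

once every device has pushed under a key, HandlePushFinish, documented next,
reduces the per-device copies and PullReady releases the pending pulls.)
+  /*! 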
+ * \brief event handler for push finish + * called when all the data with same key comes in + * \param data the buffer holds the data in all devices + * \param key the key of the data + */ + virtual void HandlePushFinish(Tensor data, int key) { + for (index_t i = 1; i < data.size(0); ++i) { + data[0] += data[i]; + } + this->PullReady(data[0], key); + } + + private: + /*! \brief task running */ + struct PullTask { + /*! \brief the task data source */ + Tensor data; + /*! \brief the key to the tensor */ + int key; + /*! + * \brief the device id, (key,devid), + * uniquely identifies a mem location + */ + int devid; + PullTask(void) {} + PullTask(Tensor data, int key, int devid) + : data(data), key(key), devid(devid) {} + }; + /*! \brief data structure to hold temporal push result */ + struct PushEntry { + // temporal space to hold input data + TensorContainer data; + // indicator whether the certain devices is already copied in + std::vector copied; + // number of data copied in + int num_copied; + // constructor + explicit PushEntry(int ndevice, Shape<2> shape) + : data(false) { + data.Resize(Shape3(ndevice, shape[0], shape[1])); + num_copied = 0; + copied.resize(ndevice, false); + } + }; + // a record to remember things related to pull request + struct PullReqRecord { + // whether this record contains a pending request + // waiting for pull ready + bool pending; + // the destination to pull data into + Tensor dest; + // the priority of the + int priority; + // callback function + CallbackFunction *callback; + // argument for callback + void *callback_arg; + PullReqRecord(void) : pending(false) { + } + }; + /*! \brief data structure to hold pull request */ + struct PullEntry { + // data to be pulled back + Tensor data; + // whether the data is ready + bool ready; + // pullrequest record + std::vector req; + // whether there is thread waiting on this event + std::vector wait; + PullEntry(void) + : ready(false) { + } + }; + // signal to notify all the thread about class destruction + bool destroy_signal; + // vector of devices + std::vector devices; + // device index to local index + std::vector dev2index; + //----- data structure used to support push ---- + // stream used by push thread each device for memcpy + std::vector*> push_stream; + // the queue used for push task + utils::ThreadPQueue push_queue; + // thread to handle push task + utils::Thread thread_push_handler; + // the map of push buffer + std::map push_buffer; + //----- data structure used to support pull ---- + // the queue used for pull task + utils::ThreadPQueue > pull_queue; + // stream used by pull thread each device for memcpy + std::vector*> pull_stream; + // the map to store pull status + utils::ThreadSafeMap pull_map; + // thread to handle pull task + utils::Thread thread_pull_handler; + // lock to lock request field + utils::Mutex request_lock; + // lock to lock wait field + utils::Mutex wait_lock; + // push handler + inline void PushHandler(void) { + // allocate stream resources + for (size_t i = 0; i < devices.size(); ++i) { + SetDevice(devices[i]); + push_stream.push_back(NewStream()); + } + while (!destroy_signal) { + PullTask tsk; + if (push_queue.Pop(&tsk)) { + if (push_buffer.count(tsk.key) == 0) { + push_buffer[tsk.key] = new PushEntry(devices.size(), tsk.data.shape_); + } + const int wid = GetWorkIndex(tsk.devid); + PushEntry &e = *push_buffer[tsk.key]; + utils::Check(e.data[0].shape_ == tsk.data.shape_, + "Tensor with same key must share same shape"); + utils::Assert(!e.copied[wid], "data inconsistency"); + 
// start copy
+        SetDevice<xpu>(tsk.devid);
+        Copy(e.data[wid], tsk.data, push_stream[wid]);
+        // mark copied
+        e.copied[wid] = true;
+        e.num_copied += 1;
+        if (e.num_copied >= static_cast<int>(devices.size())) {
+          this->HandlePushFinish(e.data, tsk.key);
+          std::fill(e.copied.begin(), e.copied.end(), false);
+          e.num_copied = 0;
+        }
+      } else {
+        utils::Assert(destroy_signal, "abort but not destroy");
+      }
+    }
+    // free resources
+    for (size_t i = 0; i < devices.size(); ++i) {
+      SetDevice<xpu>(devices[i]);
+      DeleteStream(push_stream[dev2index[devices[i]]]);
+    }
+    for (typename std::map<int, PushEntry*>::iterator
+             it = push_buffer.begin(); it != push_buffer.end(); ++it) {
+      delete it->second;
+    }
+    push_buffer.clear();
+  }
+  /*!\brief entry point of push handler thread */
+  inline static MSHADOW_THREAD_PREFIX PushHandlerThread(void *pthread) {
+    static_cast<LocalServer*>(pthread)->PushHandler();
+    utils::ThreadExit(NULL);
+    return NULL;
+  }
+  // get internal index of device
+  inline int GetWorkIndex(int devid) const {
+    utils::Check(devid >= 0 &&
+                 devid < static_cast<int>(dev2index.size()) &&
+                 dev2index[devid] >= 0,
+                 "Push: invalid devid");
+    return dev2index[devid];
+  }
+  // functions to handle pull
+  inline void InitPullMap(int key) {
+    pull_map.Init(key);
+    PullEntry &e = pull_map.GetRef(key);
+    if (e.req.size() == 0) {
+      request_lock.Lock();
+      // must recheck after lock
+      if (e.req.size() == 0) {
+        e.req.resize(devices.size(), PullReqRecord());
+      }
+      request_lock.Unlock();
+    }
+    if (e.wait.size() == 0) {
+      wait_lock.Lock();
+      // must recheck after lock
+      if (e.wait.size() == 0) {
+        e.wait.resize(devices.size(), false);
+      }
+      wait_lock.Unlock();
+    }
+  }
+};
+} // namespace ps
+} // namespace mshadow
+#endif // MSHADOW_PS_LOCAL_INL_H_
diff --git a/mshadow-ps/ps_mthread-inl.h b/mshadow-ps/ps_mthread-inl.h
deleted file mode 100644
index 9e0ee1124c64..000000000000
--- a/mshadow-ps/ps_mthread-inl.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/*!
- * Copyright (c) 2014 by Contributors
- * \file ps_mthread-inl.h
- * \brief multi-threading implementation of PS abstraction
- *
- * \author Tianqi Chen
- */
-#ifndef MSHADOW_PS_MTHREAD_INL_H_
-#define MSHADOW_PS_MTHREAD_INL_H_
-#include "./thread.h"
-#include "./ps.h"
-
-#endif // MSHADOW_PS_MTHREAD_INL_H_
diff --git a/mshadow-ps/thread.h b/mshadow-ps/thread.h
index 67bf2a6fc92a..18d6891d542a 100644
--- a/mshadow-ps/thread.h
+++ b/mshadow-ps/thread.h
@@ -1,5 +1,5 @@
-#ifndef MSHADOW_UTILS_THREAD_H_
-#define MSHADOW_UTILS_THREAD_H_
+#ifndef MSHADOW_PS_THREAD_H_
+#define MSHADOW_PS_THREAD_H_
 /*!
  * \file thread.h
  * \brief this header include the minimum necessary resource for multi-threading that can be compiled in windows, linux, mac
  * \author Tianqi Chen
  */
@@ -161,4 +161,4 @@ inline void ThreadExit(void *status) {
 } // namespace mshadow
 #define MSHADOW_THREAD_PREFIX void *
 #endif // Linux
-#endif // MSHADOW_UTILS_THREAD_H_
+#endif // MSHADOW_PS_THREAD_H_
diff --git a/mshadow-ps/thread_util.h b/mshadow-ps/thread_util.h
new file mode 100644
index 000000000000..ced4a3e0bfef
--- /dev/null
+++ b/mshadow-ps/thread_util.h
@@ -0,0 +1,141 @@
+#ifndef MSHADOW_PS_THREAD_UTIL_H_
+#define MSHADOW_PS_THREAD_UTIL_H_
+/*!
+ * \file thread_util.h
+ * \brief data structures for multi-threading communication
+ * \author Tianqi Chen
+ */
+#include <queue>
+#include <map>
+#include "./thread.h"
+namespace mshadow {
+namespace utils {
+/*!
+ * \brief thread safe queue that can be used for producer-consumer model
+ * in the future, it will support priority scheduling
+ * \tparam DType the content of the queue
+ */
+template<typename DType>
+class ThreadPQueue {
+ public:
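(editor's aside, not part of the patch: a minimal sketch of the queue's
contract with hypothetical values. Init must precede any Push or Pop, Pop
blocks on the internal semaphore, and Abort wakes blocked consumers so they
can observe shutdown:

  utils::ThreadPQueue<int> q;
  q.Init();
  q.Push(42, 1);         // producer side, priority 1
  int out;
  if (q.Pop(&out)) {     // consumer side, blocks until data or Abort
    // out == 42 here
  }
  q.Abort(1);            // wake up at most one blocked consumer
  q.Destroy();

this is the structure the push/pull handler threads above loop on.)
+  /*! 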
\brief initialize the queue, must call this before use */
+  inline void Init(void) {
+    lock_.Init();
+    counter_.Init(0);
+  }
+  /*! \brief destroy the resources on the queue */
+  inline void Destroy(void) {
+    lock_.Destroy();
+    counter_.Destroy();
+  }
+  /*!
+   * \brief Destroy the queue
+   * wake up all the threads waiting on pop
+   * this is usually used in class destructor
+   * \param max_nthread the maximum number of threads that
+   *        could be waiting on the queue
+   */
+  inline void Abort(int max_nthread = 1) {
+    for (int i = 0; i < max_nthread; ++ i) {
+      counter_.Post();
+    }
+  }
+  /*!
+   * \brief push an element to the queue
+   * \param data the data to be pushed into queue
+   * \param priority optional priority level to hint which
+   *     element should be popped first
+   */
+  inline void Push(const DType &data, int priority = 0) {
+    lock_.Lock();
+    queue_.push(Entry(data, priority));
+    lock_.Unlock();
+    counter_.Post();
+  }
+  /*!
+   * \brief pop an element from the queue
+   * this will block the thread if the queue is empty
+   * \param data_out the address to put output of the queue
+   * \return true if an element is returned,
+   *  false if abort is called and no element was left in queue
+   */
+  inline bool Pop(DType *data_out) {
+    counter_.Wait();
+    lock_.Lock();
+    if (queue_.size() == 0) {
+      lock_.Unlock(); return false;
+    }
+    *data_out = queue_.top().data;
+    queue_.pop();
+    lock_.Unlock();
+    return true;
+  }
+
+ private:
+  // entry in the queue
+  struct Entry {
+    DType data;
+    int priority;
+    Entry(const DType &data, int priority)
+        : data(data), priority(priority) {}
+    inline bool operator<(const Entry &b) const {
+      return priority < b.priority;
+    }
+  };
+
+  // the queue to push
+  std::priority_queue<Entry> queue_;
+  // lock for accessing the queue
+  utils::Mutex lock_;
+  // counter to count number of push tasks
+  utils::Semaphore counter_;
+};
+// naive implementation of threadsafe map
+template<typename TValue>
+class ThreadSafeMap {
+ public:
+  inline void Init(void) {
+    lock_.Init();
+  }
+  inline void Destroy(void) {
+    for (typename std::map<int, TValue*>::iterator
+             it = map_.begin(); it != map_.end(); ++it) {
+      delete it->second;
+    }
+    lock_.Destroy();
+  }
+  inline TValue *Get(int key) {
+    TValue *ret;
+    lock_.Lock();
+    typename std::map<int, TValue*>::const_iterator
+        it = map_.find(key);
+    if (it == map_.end() || it->first != key) {
+      ret = NULL;
+    } else {
+      ret = it->second;
+    }
+    lock_.Unlock();
+    return ret;
+  }
+  inline TValue &GetRef(int key) {
+    TValue *ret = this->Get(key);
+    utils::Assert(ret != NULL, "key does not exist");
+    return *ret;
+  }
+  inline void Init(int key) {
+    lock_.Lock();
+    if (map_.count(key) == 0) {
+      map_[key] = new TValue();
+    }
+    lock_.Unlock();
+  }
+
+ private:
+  // lock for accessing the map
+  utils::Mutex lock_;
+  std::map<int, TValue*> map_;
+};
+
+} // namespace utils
+} // namespace mshadow
+#endif // MSHADOW_PS_THREAD_UTIL_H_
diff --git a/mshadow/tensor.h b/mshadow/tensor.h
index 8eacb8f6f290..fc55acb3242f 100644
--- a/mshadow/tensor.h
+++ b/mshadow/tensor.h
@@ -422,6 +422,12 @@ inline void InitTensorEngine(int device_id = 0);
  * for using tensors in CPU, this call is actually not needed
  */
 inline void ShutdownTensorEngine(void);
+/*!
+ * \brief set the device of current thread to work on
+ * \param devid the device id
+ */
+template<typename Device>
+inline void SetDevice(int devid);
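(editor's aside: the declaration above is what the push/pull handler threads
in ps_local-inl.h build on; a sketch of the per-device setup pattern they
use, assuming a GPU build and a hypothetical streams vector:

  // one stream per device, created from the thread that owns the device
  for (size_t i = 0; i < devices.size(); ++i) {
    SetDevice<gpu>(devices[i]);            // bind this thread to the device
    streams.push_back(NewStream<gpu>());   // per-device stream for copies
  }

each handler thread creates its streams on the device it has just bound, and
deletes them from the same device on shutdown.)
 /*! 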
* \brief create a new stream from system * \return a pointer to the created stream diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h index 33174171b718..ab5fb4f0a299 100644 --- a/mshadow/tensor_cpu-inl.h +++ b/mshadow/tensor_cpu-inl.h @@ -13,6 +13,9 @@ namespace mshadow { template<> +inline void SetDevice(int devid) { +} +template<> inline Stream *NewStream(void) { return new Stream(); } diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h index ea8eaa4bd351..9c2eb48ccf52 100644 --- a/mshadow/tensor_gpu-inl.h +++ b/mshadow/tensor_gpu-inl.h @@ -46,7 +46,10 @@ inline void InitTensorEngine(int dev_id) { inline void ShutdownTensorEngine(void) { cublasShutdown(); } - +template<> +inline void SetDevice(int devid) { + utils::Check(cudaSetDevice(devid) == cudaSuccess, "cannot set device"); +} template inline void AllocSpace(Tensor *obj, bool pad) { size_t pitch; diff --git a/mshadow/utils.h b/mshadow/utils.h index 3da31a986c56..6003f5562814 100644 --- a/mshadow/utils.h +++ b/mshadow/utils.h @@ -42,7 +42,7 @@ void HandlePrint(const char *msg); #endif /*! \brief assert an condition is true, use this to handle debug information */ -inline void AssertX(bool exp, const char *fmt, ...) { +inline void Assert(bool exp, const char *fmt, ...) { if (!exp) { std::string msg(kPrintBuffer, '\0'); va_list args; From efe7f940552b33b8ba3ab718130782a85b804f73 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 7 Jan 2015 05:45:54 -0800 Subject: [PATCH 080/147] put up everything together --- mshadow-ps/ps_local-inl.h | 137 ++++++++++++++++++++++++++++++++------ mshadow-ps/thread.h | 28 ++++++++ mshadow-ps/thread_util.h | 3 +- 3 files changed, 148 insertions(+), 20 deletions(-) diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 1f84e24be025..0f90c88d0098 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -28,17 +28,35 @@ class LocalServer : public IParamServer { push_queue.Abort(1); pull_queue.Abort(1); thread_push_handler.Join(); - thread_pull_handler.Join(); + thread_pull_handler.Join(); push_queue.Destroy(); + pull_queue.Destroy(); pull_map.Destroy(); + request_lock.Destroy(); + wait_lock.Destroy(); + wait_cond.Destroy(); } virtual void PullWait(int key, int devid) { - + const int wid = GetWorkIndex(devid); + PullEntry &e = pull_map.GetRef(key); + // wake up waiters if any + utils::Assert(e.wait.size() == devices.size(), + "must initialize the key"); + PullWaitRecord &w = e.wait[wid]; + if (!w.finished) { + wait_lock.Lock(); + w.nwait += 1; + while (!w.finished) { + wait_cond.Wait(&wait_lock); + } + w.nwait -= 1; + utils::Assert(w.nwait >= 0, "boundary check"); + wait_lock.Unlock(); + } } virtual void Init(const std::vector &devices) { utils::Check(devices.size() != 0, "LocalServer.Init: must at least contain 1 devices"); - push_queue.Init(); this->devices = devices; // initialize device id to local index dev2index.clear(); @@ -50,13 +68,21 @@ class LocalServer : public IParamServer { } dev2index[devid] = static_cast(i); } + // initialize all the thread related things + push_queue.Init(); + pull_queue.Init(); + pull_map.Init(); + request_lock.Init(); + wait_lock.Init(); + wait_cond.Init(); // initialize the thread thread_push_handler.Start(PushHandlerThread, this); + thread_pull_handler.Start(PullHandlerThread, this); } protected: virtual void Push_(Tensor data, int key, int devid, int priority) { - this->InitPullMap(key); + this->InitPullMap(key, devid); push_queue.Push(PullTask(data, key, devid), priority); } virtual void PullReq_(Tensor data, 
@@ -66,12 +92,19 @@ class LocalServer : public IParamServer { PullEntry &e = pull_map.GetRef(key); utils::Assert(e.req.size() == devices.size(), "must initialize the key"); + utils::Assert(e.wait.size() == devices.size(), + "must initialize the key"); const int wid = GetWorkIndex(devid); PullReqRecord &r = e.req[wid]; r.dest = data; r.priority = priority; r.callback = callback; - r.callback_arg = callback_arg; + r.callback_arg = callback_arg; + // reset pull request finish mark + wait_lock.Lock(); + e.wait[wid].finished = false; + wait_lock.Unlock(); + // check ready event request_lock.Lock(); utils::Check(!r.pending, "cannot send duplicate pull request before it finishes"); @@ -162,16 +195,25 @@ class LocalServer : public IParamServer { PullReqRecord(void) : pending(false) { } }; + // a record to help handle pullwait + struct PullWaitRecord { + // number of thread that waits for the request to finish + int nwait; + // the request was finished + bool finished; + PullWaitRecord(void) : nwait(0), finished(false) { + } + }; /*! \brief data structure to hold pull request */ struct PullEntry { // data to be pulled back - Tensor data; + Tensor src; // whether the data is ready bool ready; // pullrequest record - std::vector req; + std::vector req; // whether there is thread waiting on this event - std::vector wait; + std::vector wait; PullEntry(void) : ready(false) { } @@ -203,7 +245,9 @@ class LocalServer : public IParamServer { // lock to lock request field utils::Mutex request_lock; // lock to lock wait field - utils::Mutex wait_lock; + utils::Mutex wait_lock; + // conditional variable to do waiting + utils::ConditionVariable wait_cond; // push handler inline void PushHandler(void) { // allocate stream resources @@ -240,7 +284,7 @@ class LocalServer : public IParamServer { // free resources for (size_t i = 0; i < devices.size(); ++i) { SetDevice(devices[i]); - DeleteStream(push_stream[dev2index[devices[i]]]); + DeleteStream(push_stream[i]); } for (typename std::map::iterator it = push_buffer.begin(); it != push_buffer.end(); ++it) { @@ -254,6 +298,61 @@ class LocalServer : public IParamServer { utils::ThreadExit(NULL); return NULL; } + + // push handler + inline void PullHandler(void) { + // allocate stream resources + for (size_t i = 0; i < devices.size(); ++i) { + SetDevice(devices[i]); + pull_stream.push_back(NewStream()); + } + while (!destroy_signal) { + std::pair tsk; + if (pull_queue.Pop(&tsk)) { + const int key = tsk.first; + const int devid = tsk.second; + const int wid = GetWorkIndex(devid); + PullEntry &e = pull_map.GetRef(key); + { + // handle request + utils::Assert(e.req.size() == devices.size(), + "must initialize the key"); + PullReqRecord &r = e.req[wid]; + SetDevice(devid); + Copy(r.dest, e.src, pull_stream[wid]); + // callback, if any + if (r.callback != NULL) { + (*r.callback)(pull_stream[wid], r.callback_arg); + } + } + { + // wake up waiters if any + utils::Assert(e.wait.size() == devices.size(), + "must initialize the key"); + PullWaitRecord &w = e.wait[wid]; + wait_lock.Lock(); + w.finished = true; + if(w.nwait != 0) { + wait_cond.Broadcast(); + } + wait_lock.Unlock(); + } + } else { + utils::Assert(destroy_signal, "abort but not destroy"); + } + } + // free resources + for (size_t i = 0; i < devices.size(); ++i) { + SetDevice(devices[i]); + DeleteStream(pull_stream[i]); + } + } + /*!\brief entry point of loader thread */ + inline static MSHADOW_THREAD_PREFIX PullHandlerThread(void *pthread) { + static_cast(pthread)->PullHandler(); + utils::ThreadExit(NULL); + return 
NULL; + } // get internal index of device inline int GetWorkIndex(int devid) const { utils::Check(devid >= 0 && @@ -263,24 +362,24 @@ class LocalServer : public IParamServer { return dev2index[devid]; } // functions to handle pull - inline void InitPullMap(int key) { + inline void InitPullMap(int key, int devid) { pull_map.Init(key); PullEntry &e = pull_map.GetRef(key); + request_lock.Lock(); + // must recheck after lock if (e.req.size() == 0) { - request_lock.Lock(); - // must recheck after lock - if (e.req.size() == 0) { - e.req.resize(devices.size(), PullReqRecord()); - } - request_lock.Unlock(); + e.req.resize(devices.size(), PullReqRecord()); } + e.ready = false; + request_lock.Unlock(); + // check wait map if (e.wait.size() == 0) { wait_lock.Lock(); // must recheck after lock if (e.wait.size() == 0) { - e.wait.resize(devices.size(), false); + e.wait.resize(devices.size(), PullWaitRecord()); } - wait_lock.Unlock(); + wait_lock.Unlock(); } } }; diff --git a/mshadow-ps/thread.h b/mshadow-ps/thread.h index 18d6891d542a..395832c86b9f 100644 --- a/mshadow-ps/thread.h +++ b/mshadow-ps/thread.h @@ -134,10 +134,38 @@ class Mutex { inline void Destroy(void) { pthread_mutex_destroy(&mutex); } + private: + friend class ConditionVariable; pthread_mutex_t mutex; }; +// conditional variable that uses pthread +class ConditionVariable { + public: + // initialize conditional variable + inline void Init(void) { + pthread_cond_init(&cond, NULL); + } + // destroy the thread + inline void Destroy(void) { + pthread_cond_destroy(&cond); + } + // wait on the conditional variable + inline void Wait(Mutex *mutex) { + pthread_cond_wait(&cond, &(mutex->mutex)); + } + inline void Broadcast(void) { + pthread_cond_broadcast(&cond); + } + inline void Signal(void) { + pthread_cond_signal(&cond); + } + + private: + pthread_cond_t cond; +}; + /*!\brief simple thread class */ class Thread { private: diff --git a/mshadow-ps/thread_util.h b/mshadow-ps/thread_util.h index ced4a3e0bfef..660099c16df2 100644 --- a/mshadow-ps/thread_util.h +++ b/mshadow-ps/thread_util.h @@ -88,8 +88,9 @@ class ThreadPQueue { // lock for accessing the queue utils::Mutex lock_; // counter to count number of push tasks - utils::Semaphore counter_; + utils::Semaphore counter_; }; + // naive implementation of threadsafe map template class ThreadSafeMap { From 63eff679ce1c7da9df1d39e296d3c7ad133e881a Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 7 Jan 2015 05:57:06 -0800 Subject: [PATCH 081/147] checkin all mshadow ps, not tested yet, need review --- mshadow-ps/ps.h | 11 +++++++++- mshadow-ps/ps_dist-inl.h | 44 +++++++++++++++++++++++++++++++++++++++ mshadow-ps/ps_local-inl.h | 2 +- 3 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 mshadow-ps/ps_dist-inl.h diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h index de711ee0aa07..0bf8b0ed9bd5 100644 --- a/mshadow-ps/ps.h +++ b/mshadow-ps/ps.h @@ -16,6 +16,11 @@ #endif // C++11 #include "../mshadow/tensor.h" +/*! \brief whether to adapt distributed PS from parameter-server */ +#ifndef MSHADOW_DIST_PS_ +#define MSHADOW_DIST_PS_ 1 +#endif + namespace mshadow { namespace ps { /*! @@ -168,6 +173,7 @@ class IParamServer { } // namespace mshadow #include "./ps_local-inl.h" +#include "./ps_dist-inl.h" namespace mshadow { namespace ps { /*! 
@@ -176,7 +182,10 @@ namespace ps {
  */
 template<typename xpu, typename DType>
 inline IParamServer<xpu, DType> *Create(const char *type) {
-  if (!strcmp("local", type)) return new LocalServer<xpu, DType>();
+  if (!strcmp("local", type)) return new LocalServer<xpu, DType>();
+#if MSHADOW_DIST_PS_
+  if (!strcmp("dist", type)) return new DistServer<xpu, DType>();
+#endif
   utils::Error("unknown server type %s\n", type);
   return NULL;
 }
diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h
new file mode 100644
index 000000000000..3081bd658268
--- /dev/null
+++ b/mshadow-ps/ps_dist-inl.h
@@ -0,0 +1,44 @@
+/*!
+ * Copyright (c) 2014 by Contributors
+ * \file ps_dist-inl.h
+ * \brief distributed implementation of PS abstraction
+ *
+ * \author Tianqi Chen, Mu Li
+ */
+#ifndef MSHADOW_PS_DIST_INL_H_
+#define MSHADOW_PS_DIST_INL_H_
+#include "./ps_local-inl.h"
+
+namespace mshadow {
+namespace ps {
+#if MSHADOW_DIST_PS_
+template<typename xpu, typename DType>
+class DistServer : public LocalServer<xpu, DType> {
+ public:
+  // parent type
+  typedef LocalServer<xpu, DType> Parent;
+  // initialize the parameter server
+  virtual void Init(const std::vector<int> &devices) {
+    Parent::Init(devices);
+  }
+  virtual ~DistServer(void) {
+  }
+  // override this function, to use parameter server
+  virtual void HandlePushFinish(Tensor<cpu, 3, DType> data, int key) {
+    for (index_t i = 1; i < data.size(0); ++i) {
+      data[0] += data[i];
+    }
+    // something like
+    //auto callback = [&]() {
+    //  receive data into dptr
+    //  call pullready to notify the module
+    //this->PullReady(recvdata, key);
+    //}
+    // push(key, data[0].dptr_, data.MSize(), callback);
+  }
+};
+#endif
+} // namespace ps
+} // namespace mshadow
+#endif
+
diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h
index 0f90c88d0098..b290be7493af 100644
--- a/mshadow-ps/ps_local-inl.h
+++ b/mshadow-ps/ps_local-inl.h
@@ -3,7 +3,7 @@
  * \file ps_local-inl.h
  * \brief local multi-threading implementation of PS abstraction
  *
- * \author Tianqi Chen
+ * \author Tianqi Chen, Mu Li
  */
 #ifndef MSHADOW_PS_LOCAL_INL_H_
 #define MSHADOW_PS_LOCAL_INL_H_

From 716fe6c47b1fd8d3726adaf5efae4650d42e4f31 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Wed, 7 Jan 2015 07:01:58 -0800
Subject: [PATCH 082/147] checkin simple testcase

---
 mshadow-ps/Makefile       |  2 +-
 mshadow-ps/ps.h           |  4 ++--
 mshadow-ps/ps_local-inl.h |  1 +
 mshadow-ps/test.cpp       | 40 +++++++++++++++++++++++++++++++++
 4 files changed, 44 insertions(+), 3 deletions(-)
 create mode 100644 mshadow-ps/test.cpp

diff --git a/mshadow-ps/Makefile b/mshadow-ps/Makefile
index ed2d466ff490..bec318a84e8c 100644
--- a/mshadow-ps/Makefile
+++ b/mshadow-ps/Makefile
@@ -2,7 +2,7 @@
 export CC  = gcc
 export CXX = clang++
 export NVCC =nvcc
-export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -DMSHADOW_STAND_ALONE=1
+export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -DMSHADOW_STAND_ALONE=1 -std=c++11
 export LDFLAGS= -lm -lpthread
 export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX)

diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h
index 0bf8b0ed9bd5..b4ce92f8ee01 100644
--- a/mshadow-ps/ps.h
+++ b/mshadow-ps/ps.h
@@ -111,7 +111,7 @@ class IParamServer {
                       CallbackFunction callback = NULL,
                       void *callback_arg = NULL) {
     this->PullReq_(data.FlatTo2D(), key,
-                   devid, priority, callback);
+                   devid, priority, callback, callback_arg);
   }
 #if __cplusplus >= 201103L
   template<int dim>
@@ -164,7 +164,7 @@ class IParamServer {
 // C++11 support for lambda prepare function
 #if __cplusplus >= 201103L
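(editor's aside: the hunk below is the companion fix, making the lambda
trampoline static so it can be passed where a plain function pointer is
expected. The general pattern, as a standalone hedged sketch with
hypothetical names:

  #include <functional>
  // plain C-style callback slot: void (*)(void *)
  static void Invoke(void *arg) {
    std::function<void()> *fp = static_cast<std::function<void()>*>(arg);
    (*fp)();     // run the captured lambda
    delete fp;   // heap-allocated by the caller so it outlives the scope
  }
  // registration side:
  //   register_callback(Invoke, new std::function<void()>(my_lambda));

patch 083 later in this series applies exactly this heap-allocation step to
the lambda overload of PullReq.)
   /*! 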
\brief hack function to convert lambda to callback function */ - inline void InvokeLambda_(Stream *stream, void *fun) { + inline static void InvokeLambda_(Stream *stream, void *fun) { (*static_cast *stream)>*>(fun))(stream); } #endif // C++11 diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index b290be7493af..71e88bb8ca89 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -126,6 +126,7 @@ class LocalServer : public IParamServer { "must initialize the key"); request_lock.Lock(); e.ready = true; + e.src = data; for (int i = 0; i < e.req.size(); ++i) { if (e.req[i].pending) { pull_queue.Push(std::make_pair(key, devices[i])); diff --git a/mshadow-ps/test.cpp b/mshadow-ps/test.cpp new file mode 100644 index 000000000000..b7f6d0a8a1ce --- /dev/null +++ b/mshadow-ps/test.cpp @@ -0,0 +1,40 @@ +#include "./ps.h" +using namespace mshadow; +void Print1DTensor(Tensor const &ts) { + for (index_t i = 0; i < ts.size(0); ++i) { + printf("%.2f ", ts[i]); + } + printf("\n"); +} + +void Print2DTensor(Tensor const &ts) { + for (index_t i = 0; i < ts.size(0); ++i) { + Print1DTensor(ts[i]); + } +} + +int main(int argc, char *argv[]) { + if (argc < 2) { + printf("Usage:\n"); return 0; + } + int ndev = atoi(argv[1]); + ps::IParamServer *ps = ps::Create("local"); + TensorContainer ts(Shape3(ndev,5,2)); + TensorContainer res(Shape3(ndev,5,2)); + std::vector devs; + for (int i = 0; i < ndev; ++i) { + devs.push_back(i); + ts[i] = 1.0 + i; + } + ps->Init(devs); + for (int i = 0; i < ndev; ++i) { + ps->Push(ts[i], 3, i); + ps->PullReq(res[i], 3, i); + } + for (int i = 0; i < ndev; ++i) { + ps->PullWait(3, i); + printf("----dev=%d----\n", i); + Print2DTensor(res[i]); + } + return 0; +} From 64469d5e44a043b165123c4e371c71e56b24fe28 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 7 Jan 2015 07:17:39 -0800 Subject: [PATCH 083/147] more callback example --- mshadow-ps/ps.h | 9 +++++++-- mshadow-ps/test.cpp | 7 ++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h index b4ce92f8ee01..3d039ad772ad 100644 --- a/mshadow-ps/ps.h +++ b/mshadow-ps/ps.h @@ -120,7 +120,10 @@ class IParamServer { int devid, int priority, std::function *stream)> callback) { - this->PullReq(data, key, devid, priority, InvokeLambda_, &callback); + // need to allocate space, because callback can happen latter.. + auto calbk = new std::function *stream)>(); + *calbk = callback; + this->PullReq(data, key, devid, priority, InvokeLambda_, calbk); } #endif // C++11 protected: @@ -165,7 +168,9 @@ class IParamServer { #if __cplusplus >= 201103L /*! \brief hack function to convert lambda to callback function */ inline static void InvokeLambda_(Stream *stream, void *fun) { - (*static_cast *stream)>*>(fun))(stream); + auto *fp = static_cast *stream)>*>(fun); + (*fp)(stream); + delete fp; } #endif // C++11 }; diff --git a/mshadow-ps/test.cpp b/mshadow-ps/test.cpp index b7f6d0a8a1ce..2faca5548c68 100644 --- a/mshadow-ps/test.cpp +++ b/mshadow-ps/test.cpp @@ -29,7 +29,12 @@ int main(int argc, char *argv[]) { ps->Init(devs); for (int i = 0; i < ndev; ++i) { ps->Push(ts[i], 3, i); - ps->PullReq(res[i], 3, i); + int a = i; + ps->PullReq(res[i], 3, i, 0, [&](Stream *stream) { + printf("hello i=%d, a=%d,remember during callback, do not take local varaible.. 
\n", i, a); + ts += 1.0f; + } + ); } for (int i = 0; i < ndev; ++i) { ps->PullWait(3, i); From 99446f5448ec9a14346e55bf5877ca5fdcf05a47 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 7 Jan 2015 07:41:52 -0800 Subject: [PATCH 084/147] add two wait for gpu --- mshadow-ps/ps_local-inl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 71e88bb8ca89..22dcaf25ab23 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -270,6 +270,8 @@ class LocalServer : public IParamServer { // start copy SetDevice(tsk.devid); Copy(e.data[wid], tsk.data, push_stream[wid]); + // wait till the copy finishes + push_stream[wid]->Wait(); // mark copied e.copied[wid] = true; e.num_copied += 1; @@ -325,6 +327,8 @@ class LocalServer : public IParamServer { if (r.callback != NULL) { (*r.callback)(pull_stream[wid], r.callback_arg); } + // wait till the operation finishes + pull_stream[wid]->Wait(); } { // wake up waiters if any From 7812e4a46cfe89cdfa5865c39f309516cb1e5509 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 7 Jan 2015 15:35:11 -0800 Subject: [PATCH 085/147] fix stream when take operator[] --- mshadow/tensor.h | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/mshadow/tensor.h b/mshadow/tensor.h index fc55acb3242f..6a923fe81409 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -270,8 +270,16 @@ struct Tensor: public TRValue, : dptr_(dptr), shape_(shape), stride_(shape[kSubdim]), stream_(NULL) {} /*! \brief constructor from data pointer and shape */ MSHADOW_XINLINE Tensor(DType *dptr, - const Shape &shape, index_t stride) - : dptr_(dptr), shape_(shape), stride_(stride), stream_(NULL) {} + const Shape &shape, + index_t stride, Stream *stream) + : dptr_(dptr), shape_(shape), stride_(stride), stream_(stream) {} + /*! + * \brief set the stream to do computation of current tensor + * \param stream the computation stream + */ + inline void set_stream(Stream *stream) { + this->stream_ = stream; + } /*! * \return memory cost of the tensor, including the aligned x dimension * \tparam startdim the starting dimension @@ -304,7 +312,7 @@ struct Tensor: public TRValue, * \return tensor after flatten */ MSHADOW_XINLINE Tensor FlatTo2D(void) const { - return Tensor(dptr_, shape_.FlatTo2D(), stride_); + return Tensor(dptr_, shape_.FlatTo2D(), stride_, stream_); } /*! * \brief get a element of dimension - 1 @@ -313,7 +321,7 @@ struct Tensor: public TRValue, */ MSHADOW_XINLINE Tensor operator[](index_t idx) const { return Tensor(dptr_ + this->MemSize<1>() * idx, - shape_.SubShape(), stride_); + shape_.SubShape(), stride_, stream_); } /*! 
* \brief slice the tensor in highest dimension [begin,end) @@ -326,7 +334,7 @@ struct Tensor: public TRValue, Shape s = this->shape_; s[0] = end - begin; return Tensor(dptr_ + this->MemSize<1>() * begin, - s, stride_); + s, stride_, stream_); } /*!\brief implement the assignment of same type */ template @@ -365,15 +373,19 @@ struct Tensor: : shape_(shape), stream_(NULL) {} MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape) : dptr_(dptr), shape_(shape), stride_(shape[0]), stream_(NULL) {} - MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape, index_t stride) - : dptr_(dptr), shape_(shape), stride_(stride), stream_(NULL) {} + MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape, + index_t stride, Stream *stream) + : dptr_(dptr), shape_(shape), stride_(stride), stream_(stream) {} + inline void set_stream(Stream *stream) { + this->stream_ = stream; + } MSHADOW_XINLINE Tensor FlatTo2D(void) const { - return Tensor(dptr_, shape_.FlatTo2D(), stride_); + return Tensor(dptr_, shape_.FlatTo2D(), stride_, stream_); } MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const { Shape<1> s; s[0] = end - begin; - return Tensor(dptr_ + begin, s); + return Tensor(dptr_ + begin, s, s[0], stream_); } MSHADOW_XINLINE size_t MSize(void) const { return shape_[0]; From 4da8d2b136cdbf4503bd1e2c49c9a34e07bbc9a0 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 7 Jan 2015 16:33:22 -0800 Subject: [PATCH 086/147] change stream to add force stream --- mshadow/base.h | 8 ++++++++ mshadow/random.h | 8 ++++---- mshadow/stream_gpu-inl.h | 9 +++++++-- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/mshadow/base.h b/mshadow/base.h index cbb94f3abae5..2c3ffc621f7d 100644 --- a/mshadow/base.h +++ b/mshadow/base.h @@ -42,6 +42,14 @@ #define MSHADOW_USE_CUDA 0 #endif +/*! + * \brief force user to use GPU stream during computation + * error will be shot when default stream NULL is used + */ +#ifndef MSHADOW_FORCE_STREAM +#define MSHADOW_FORCE_STREAM 0 +#endif + /*! \brief use CBLAS for CBLAS */ #ifndef MSHADOW_USE_CBLAS #define MSHADOW_USE_CBLAS 0 diff --git a/mshadow/random.h b/mshadow/random.h index 7a52a25d1ce6..dd6bd0bf2057 100644 --- a/mshadow/random.h +++ b/mshadow/random.h @@ -54,7 +54,7 @@ class Random { * \brief set the stream of computation * \param stream computation stream */ - inline void SetStream(Stream *stream) { + inline void set_stream(Stream *stream) { } /*! * \brief generate data from uniform [a,b) @@ -239,11 +239,11 @@ class Random { * \brief set the stream of computation * \param stream computation stream */ - inline void SetStream(Stream *stream) { + inline void set_stream(Stream *stream) { curandStatus_t status; - status = curandSetStream(gen_, Stream::GetStream(stream)); + status = curandset_stream(gen_, Stream::GetStream(stream)); utils::Check(status == CURAND_STATUS_SUCCESS, - "SetStream CURAND failed"); + "set_stream CURAND failed"); } /*! 
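 * (editor's aside: the intended use is to route the sampler onto the same
 *  stream as the tensors it fills; a minimal sketch with an illustrative
 *  tensor ts that already carries a stream:
 * \code
 *   Random<gpu, float> rnd(0);     // seeded generator
 *   rnd.set_stream(ts.stream_);    // sample on the tensor's stream
 *   rnd.SampleUniform(&ts);
 * \endcode
 *  note the hunk above introduces a call to curandset_stream; the correct
 *  curandSetStream spelling is restored by patch 088 below.)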
* \brief seed random number generator using this seed diff --git a/mshadow/stream_gpu-inl.h b/mshadow/stream_gpu-inl.h index e7d18aa42001..9862a4d6e241 100644 --- a/mshadow/stream_gpu-inl.h +++ b/mshadow/stream_gpu-inl.h @@ -41,8 +41,13 @@ struct Stream { * \brief returns actual cudaStream_t given an input GPU stream pointer * \param stream pointer to GPU stream */ - inline static cudaStream_t GetStream(Stream *stream) { - if (stream == NULL) return 0; + inline static cudaStream_t GetStream(Stream *stream) { + if (stream == NULL) { +#if MSHADOW_FORCE_STREAM + utils::Error("Default GPU stream was used when MSHADOW_FORCE_STREAM was on"); +#endif + return 0; + } else return stream->stream_; } }; From 1fed56731b81d967e3c02b86bc520c8b77796a18 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 7 Jan 2015 23:00:22 -0800 Subject: [PATCH 087/147] mshadow fix --- mshadow-ps/ps.h | 1 + mshadow-ps/ps_local-inl.h | 3 ++- mshadow-ps/test.cpp | 2 ++ mshadow/io.h | 6 ++++-- mshadow/stream_gpu-inl.h | 3 ++- 5 files changed, 11 insertions(+), 4 deletions(-) diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h index 3d039ad772ad..e9b3311be4e8 100644 --- a/mshadow-ps/ps.h +++ b/mshadow-ps/ps.h @@ -65,6 +65,7 @@ class IParamServer { } /*! * \brief wait until the pull event finishes + * if there was no pull request, wait will directly returns * \param key the unique key to indicate the tensor * this is unique per device * \param devid the device id this tensor lies in diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 22dcaf25ab23..d2c83db2e6a5 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -202,7 +202,8 @@ class LocalServer : public IParamServer { int nwait; // the request was finished bool finished; - PullWaitRecord(void) : nwait(0), finished(false) { + PullWaitRecord(void) : nwait(0), finished(true) { + // set finished to true so pull without pull request returns } }; /*! \brief data structure to hold pull request */ diff --git a/mshadow-ps/test.cpp b/mshadow-ps/test.cpp index 2faca5548c68..1906f87e4429 100644 --- a/mshadow-ps/test.cpp +++ b/mshadow-ps/test.cpp @@ -30,6 +30,7 @@ int main(int argc, char *argv[]) { for (int i = 0; i < ndev; ++i) { ps->Push(ts[i], 3, i); int a = i; + ps->PullWait(3, i); ps->PullReq(res[i], 3, i, 0, [&](Stream *stream) { printf("hello i=%d, a=%d,remember during callback, do not take local varaible.. \n", i, a); ts += 1.0f; @@ -37,6 +38,7 @@ int main(int argc, char *argv[]) { ); } for (int i = 0; i < ndev; ++i) { + ps->PullWait(3, i); ps->PullWait(3, i); printf("----dev=%d----\n", i); Print2DTensor(res[i]); diff --git a/mshadow/io.h b/mshadow/io.h index 04e1681766ac..32a3dd50842c 100644 --- a/mshadow/io.h +++ b/mshadow/io.h @@ -80,7 +80,8 @@ inline void SaveBinary(TStream &fo, const Tensor &src) { // copy to CPU, then save Tensor tmp(src.shape_); AllocSpace(&tmp); - Copy(tmp, src); + Stream stream; + Copy(tmp, src, &stream); SaveBinary(fo, tmp); FreeSpace(&tmp); } @@ -113,7 +114,8 @@ inline void LoadBinary(TStream &fi, } else { dst->shape = tmp.shape; AllocSpace(dst); } - Copy(*dst, tmp); + Stream stream; + Copy(*dst, tmp, &stream); FreeSpace(&tmp); } } // namespace mshadow diff --git a/mshadow/stream_gpu-inl.h b/mshadow/stream_gpu-inl.h index 9862a4d6e241..1cd2e971fc52 100644 --- a/mshadow/stream_gpu-inl.h +++ b/mshadow/stream_gpu-inl.h @@ -18,6 +18,7 @@ template<> struct Stream { /*! \brief cudaStream */ cudaStream_t stream_; + Stream(void) : stream_(0) {} /*! 
* \brief wait for all the computation associated * with this stream to complete @@ -41,7 +42,7 @@ struct Stream { * \brief returns actual cudaStream_t given an input GPU stream pointer * \param stream pointer to GPU stream */ - inline static cudaStream_t GetStream(Stream *stream) { + inline static cudaStream_t GetStream(Stream *stream) { if (stream == NULL) { #if MSHADOW_FORCE_STREAM utils::Error("Default GPU stream was used when MSHADOW_FORCE_STREAM was on"); From 9c718ba64c6d183c6c1803ca44108d9c5faf813e Mon Sep 17 00:00:00 2001 From: winsty Date: Fri, 9 Jan 2015 05:20:50 +0800 Subject: [PATCH 088/147] curand stream --- mshadow/random.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mshadow/random.h b/mshadow/random.h index dd6bd0bf2057..d9e9d86acdcb 100644 --- a/mshadow/random.h +++ b/mshadow/random.h @@ -241,7 +241,7 @@ class Random { */ inline void set_stream(Stream *stream) { curandStatus_t status; - status = curandset_stream(gen_, Stream::GetStream(stream)); + status = curandSetStream(gen_, Stream::GetStream(stream)); utils::Check(status == CURAND_STATUS_SUCCESS, "set_stream CURAND failed"); } From 22bdd8ae234245b98bb8ab2785b879b727155704 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Thu, 8 Jan 2015 13:24:53 -0800 Subject: [PATCH 089/147] fix a warning --- mshadow-ps/ps_local-inl.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index d2c83db2e6a5..3300f7868520 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -15,7 +15,7 @@ namespace mshadow { namespace ps { -// multi-threaded implementation of +// multi-threaded implementation of template class LocalServer : public IParamServer { public: @@ -28,7 +28,7 @@ class LocalServer : public IParamServer { push_queue.Abort(1); pull_queue.Abort(1); thread_push_handler.Join(); - thread_pull_handler.Join(); + thread_pull_handler.Join(); push_queue.Destroy(); pull_queue.Destroy(); pull_map.Destroy(); @@ -82,7 +82,7 @@ class LocalServer : public IParamServer { protected: virtual void Push_(Tensor data, int key, int devid, int priority) { - this->InitPullMap(key, devid); + this->InitPullMap(key, devid); push_queue.Push(PullTask(data, key, devid), priority); } virtual void PullReq_(Tensor data, @@ -104,7 +104,7 @@ class LocalServer : public IParamServer { wait_lock.Lock(); e.wait[wid].finished = false; wait_lock.Unlock(); - // check ready event + // check ready event request_lock.Lock(); utils::Check(!r.pending, "cannot send duplicate pull request before it finishes"); @@ -127,7 +127,7 @@ class LocalServer : public IParamServer { request_lock.Lock(); e.ready = true; e.src = data; - for (int i = 0; i < e.req.size(); ++i) { + for (index_t i = 0; i < e.req.size(); ++i) { if (e.req[i].pending) { pull_queue.Push(std::make_pair(key, devices[i])); e.req[i].pending = false; @@ -140,16 +140,16 @@ class LocalServer : public IParamServer { * called when all the data with same key comes in * \param data the buffer holds the data in all devices * \param key the key of the data - */ + */ virtual void HandlePushFinish(Tensor data, int key) { for (index_t i = 1; i < data.size(0); ++i) { data[0] += data[i]; } this->PullReady(data[0], key); } - + private: - /*! \brief task running */ + /*! \brief task running */ struct PullTask { /*! 
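With the curandSetStream spelling restored above, the GPU RNG can be bound to a computation stream so sampling runs asynchronously. A sketch (the Random constructor and SampleUniform signature are assumptions from the mshadow Random API, not shown in these diffs):

    void SampleOnStream() {
      Random<gpu, float> rnd(0);           // seed 0; assumed constructor
      Stream<gpu> *s = NewStream<gpu>();
      rnd.set_stream(s);                   // calls curandSetStream internally
      TensorContainer<gpu, 2, float> w(Shape2(64, 64));
      w.set_stream(s);
      rnd.SampleUniform(&w, 0.0f, 1.0f);   // generation is queued on s
      s->Wait();
      DeleteStream(s);
    }
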
\brief the task data source */ Tensor data; @@ -187,7 +187,7 @@ class LocalServer : public IParamServer { bool pending; // the destination to pull data into Tensor dest; - // the priority of the + // the priority of the int priority; // callback function CallbackFunction *callback; @@ -213,7 +213,7 @@ class LocalServer : public IParamServer { // whether the data is ready bool ready; // pullrequest record - std::vector req; + std::vector req; // whether there is thread waiting on this event std::vector wait; PullEntry(void) @@ -344,7 +344,7 @@ class LocalServer : public IParamServer { wait_lock.Unlock(); } } else { - utils::Assert(destroy_signal, "abort but not destroy"); + utils::Assert(destroy_signal, "abort but not destroy"); } } // free resources @@ -366,7 +366,7 @@ class LocalServer : public IParamServer { dev2index[devid] >= 0, "Push: invalid devid"); return dev2index[devid]; - } + } // functions to handle pull inline void InitPullMap(int key, int devid) { pull_map.Init(key); From d9d2591bd9874a9b28b4a5e422b090246477de9b Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 8 Jan 2015 19:16:39 -0800 Subject: [PATCH 090/147] fix ps --- mshadow-ps/ps_local-inl.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 3300f7868520..f27c72de5692 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -38,7 +38,9 @@ class LocalServer : public IParamServer { } virtual void PullWait(int key, int devid) { const int wid = GetWorkIndex(devid); - PullEntry &e = pull_map.GetRef(key); + PullEntry *p = pull_map.Get(key); + if (p == NULL) return; + PullEntry &e = *p; // wake up waiters if any utils::Assert(e.wait.size() == devices.size(), "must initialize the key"); @@ -107,7 +109,8 @@ class LocalServer : public IParamServer { // check ready event request_lock.Lock(); utils::Check(!r.pending, - "cannot send duplicate pull request before it finishes"); + "key = %d, cannot send duplicate pull request before it finishes", + key); if (e.ready) { pull_queue.Push(std::make_pair(key, devid)); } else { From 049d2759dc174fa4a940806f46273e3a3e0b92b0 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 8 Jan 2015 19:41:58 -0800 Subject: [PATCH 091/147] fix sumrows --- mshadow/cuda/reduce.cuh | 4 ---- mshadow/cuda/tensor_gpu-inl.cuh | 4 +++- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/mshadow/cuda/reduce.cuh b/mshadow/cuda/reduce.cuh index 3baaf7a57ed6..b90e9416b324 100644 --- a/mshadow/cuda/reduce.cuh +++ b/mshadow/cuda/reduce.cuh @@ -62,11 +62,7 @@ inline __device__ void ReduceX(volatile DType buf[], int tid) { // in warp optimization if (x_bits >= 5) { if (tid < 16) Reducer::Reduce(buf[tid] , buf[tid + 16]); -#if __CUDA_ARCH__ < 200 __syncthreads(); -#else - __MSHADOW_EMUSYNC__; -#endif } if (x_bits >= 4) { if (tid < 8) Reducer::Reduce(buf[tid] , buf[tid + 8]); diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index e0b61ff22d30..2568bf71d787 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -19,9 +19,11 @@ namespace cuda { #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200 const int kMemUnitBits = 5; const int kMaxThreadsPerBlock = 1024; +#define MSHADOW_MEM_UNIT_BITS 5 #else const int kMemUnitBits = 4; const int kMaxThreadsPerBlock = 512; +#define MSHADOW_MEM_UNIT_BITS 4 #endif /*! 
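For context on the ReduceX churn here and again in patches 094/095 below: the kernel folds a shared-memory buffer by halving strides, and once the stride fits inside one warp the lanes execute in lock-step on sm_20+ hardware, so the inter-step barrier can be relaxed; that is the whole __syncthreads/__MSHADOW_EMUSYNC__ back-and-forth. A standalone simplified sketch of the pattern for 32 threads (sum instead of a generic Reducer):

    // Tree reduction over buf[0..31]; volatile keeps the compiler from
    // caching buf in registers between steps. Result lands in buf[0].
    __device__ void WarpReduceSum(volatile float buf[32], int tid) {
      if (tid < 16) buf[tid] += buf[tid + 16];
      if (tid < 8)  buf[tid] += buf[tid + 8];
      if (tid < 4)  buf[tid] += buf[tid + 4];
      if (tid < 2)  buf[tid] += buf[tid + 2];
      if (tid < 1)  buf[tid] += buf[tid + 1];
    }
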
\brief number of units that can do synchronized update, half warp size */ const int kMemUnit = 1 << kMemUnitBits; @@ -143,7 +145,7 @@ inline void MapReduceKeepLowest(expr::Plan dst, dim3 dimBlock(kMemUnit, kMemUnit); dim3 dimGrid((eshape[1] + kMemUnit - 1) >> kMemUnitBits); CheckLaunchParam(dimGrid, dimBlock, "MapRedKeepLowestKernel"); - MapRedKeepLowestKernel, expr::Plan > <<>>(dst, plan, scale, eshape); From c96f8aaafe628dc14ca927896f4b739564740c92 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 9 Jan 2015 12:00:29 -0800 Subject: [PATCH 092/147] fix assert ps --- mshadow-ps/ps_dist-inl.h | 9 +++++--- mshadow-ps/ps_local-inl.h | 48 ++++++++++++++++++++++----------------- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h index 3081bd658268..e181c8e88467 100644 --- a/mshadow-ps/ps_dist-inl.h +++ b/mshadow-ps/ps_dist-inl.h @@ -24,10 +24,13 @@ class DistServer : public LocalServer { virtual ~DistServer(void) { } // override this function, to use parameter server - virtual void HandlePushFinish(Tensor data, int key) { + virtual void HandlePushFinish(Tensor data, + Tensor result_buffer, + int key) { + Copy(result_buffer, data[0]); for (index_t i = 1; i < data.size(0); ++i) { - data[0] += data[i]; - } + result_buffer += data[i]; + } // something like //auto callback = [&]() { // receive data into dptr diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index f27c72de5692..d698a34ed04b 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -39,11 +39,11 @@ class LocalServer : public IParamServer { virtual void PullWait(int key, int devid) { const int wid = GetWorkIndex(devid); PullEntry *p = pull_map.Get(key); - if (p == NULL) return; + if (p == NULL || p->wait.size() == 0) return; PullEntry &e = *p; // wake up waiters if any utils::Assert(e.wait.size() == devices.size(), - "must initialize the key"); + "PullWait: must initialize the wait"); PullWaitRecord &w = e.wait[wid]; if (!w.finished) { wait_lock.Lock(); @@ -93,9 +93,9 @@ class LocalServer : public IParamServer { void *callback_arg) { PullEntry &e = pull_map.GetRef(key); utils::Assert(e.req.size() == devices.size(), - "must initialize the key"); + "PullReq: must initialize the key, req"); utils::Assert(e.wait.size() == devices.size(), - "must initialize the key"); + "PullReq: must initialize the key, wait"); const int wid = GetWorkIndex(devid); PullReqRecord &r = e.req[wid]; r.dest = data; @@ -111,7 +111,7 @@ class LocalServer : public IParamServer { utils::Check(!r.pending, "key = %d, cannot send duplicate pull request before it finishes", key); - if (e.ready) { + if (e.req[wid].ready) { pull_queue.Push(std::make_pair(key, devid)); } else { r.pending = true; @@ -126,11 +126,11 @@ class LocalServer : public IParamServer { virtual void PullReady(Tensor data, int key) { PullEntry &e = pull_map.GetRef(key); utils::Assert(e.req.size() == devices.size(), - "must initialize the key"); + "PullReady: must initialize the key, req"); request_lock.Lock(); - e.ready = true; e.src = data; for (index_t i = 0; i < e.req.size(); ++i) { + e.req[i].ready = true; if (e.req[i].pending) { pull_queue.Push(std::make_pair(key, devices[i])); e.req[i].pending = false; @@ -140,15 +140,19 @@ class LocalServer : public IParamServer { } /*! 
* \brief event handler for push finish - * called when all the data with same key comes in + * called when all the data with same key comes int * \param data the buffer holds the data in all devices + * \param result_buffer temporal buffer to hold the reduction result * \param key the key of the data */ - virtual void HandlePushFinish(Tensor data, int key) { + virtual void HandlePushFinish(Tensor data, + Tensor result_buffer, + int key) { + Copy(result_buffer, data[0]); for (index_t i = 1; i < data.size(0); ++i) { - data[0] += data[i]; + result_buffer += data[i]; } - this->PullReady(data[0], key); + this->PullReady(result_buffer, key); } private: @@ -171,14 +175,17 @@ class LocalServer : public IParamServer { struct PushEntry { // temporal space to hold input data TensorContainer data; + // temporal space to hold to copy back + TensorContainer result_buffer; // indicator whether the certain devices is already copied in std::vector copied; // number of data copied in int num_copied; // constructor explicit PushEntry(int ndevice, Shape<2> shape) - : data(false) { + : data(false), result_buffer(false) { data.Resize(Shape3(ndevice, shape[0], shape[1])); + result_buffer.Resize(shape); num_copied = 0; copied.resize(ndevice, false); } @@ -186,6 +193,8 @@ class LocalServer : public IParamServer { // a record to remember things related to pull request struct PullReqRecord { // whether this record contains a pending request + // whether pull is ready to go + bool ready; // waiting for pull ready bool pending; // the destination to pull data into @@ -196,7 +205,7 @@ class LocalServer : public IParamServer { CallbackFunction *callback; // argument for callback void *callback_arg; - PullReqRecord(void) : pending(false) { + PullReqRecord(void) : ready(false), pending(false) { } }; // a record to help handle pullwait @@ -213,14 +222,11 @@ class LocalServer : public IParamServer { struct PullEntry { // data to be pulled back Tensor src; - // whether the data is ready - bool ready; // pullrequest record std::vector req; // whether there is thread waiting on this event std::vector wait; - PullEntry(void) - : ready(false) { + PullEntry(void) { } }; // signal to notify all the thread about class destruction @@ -280,7 +286,7 @@ class LocalServer : public IParamServer { e.copied[wid] = true; e.num_copied += 1; if (e.num_copied >= static_cast(devices.size())) { - this->HandlePushFinish(e.data, tsk.key); + this->HandlePushFinish(e.data, e.result_buffer, tsk.key); std::fill(e.copied.begin(), e.copied.end(), false); e.num_copied = 0; } @@ -323,7 +329,7 @@ class LocalServer : public IParamServer { { // handle request utils::Assert(e.req.size() == devices.size(), - "must initialize the key"); + "PullHandler: must initialize the key, req"); PullReqRecord &r = e.req[wid]; SetDevice(devid); Copy(r.dest, e.src, pull_stream[wid]); @@ -337,7 +343,7 @@ class LocalServer : public IParamServer { { // wake up waiters if any utils::Assert(e.wait.size() == devices.size(), - "must initialize the key"); + "PullHandler, must initialize the key, req"); PullWaitRecord &w = e.wait[wid]; wait_lock.Lock(); w.finished = true; @@ -379,8 +385,8 @@ class LocalServer : public IParamServer { if (e.req.size() == 0) { e.req.resize(devices.size(), PullReqRecord()); } - e.ready = false; request_lock.Unlock(); + e.req[GetWorkIndex(devid)].ready = false; // check wait map if (e.wait.size() == 0) { wait_lock.Lock(); From 3d952a11c8cf25f7e89f8a3e208401a10e8adf38 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 9 Jan 2015 12:44:05 -0800 Subject: 
[PATCH 093/147] fix load binary with force stream --- mshadow/tensor_container.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mshadow/tensor_container.h b/mshadow/tensor_container.h index e2aba78e4186..5d8aeef30a9e 100644 --- a/mshadow/tensor_container.h +++ b/mshadow/tensor_container.h @@ -105,7 +105,8 @@ class TensorContainer: public Tensor { Tensor tmp; mshadow::LoadBinary(fi, &tmp, false); this->Resize(tmp.shape_); - Copy(*this, tmp); + Stream stream; + Copy(*this, tmp, &stream); mshadow::FreeSpace(&tmp); } // functions to fit exp template From 381e79b6b227068d4ebe03fd137be43827a4fb48 Mon Sep 17 00:00:00 2001 From: winsty Date: Sat, 10 Jan 2015 06:06:49 +0800 Subject: [PATCH 094/147] fix cuda arch --- mshadow/cuda/reduce.cuh | 2 +- mshadow/cuda/tensor_gpu-inl.cuh | 16 +++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/mshadow/cuda/reduce.cuh b/mshadow/cuda/reduce.cuh index b90e9416b324..3345b353fe7e 100644 --- a/mshadow/cuda/reduce.cuh +++ b/mshadow/cuda/reduce.cuh @@ -62,7 +62,7 @@ inline __device__ void ReduceX(volatile DType buf[], int tid) { // in warp optimization if (x_bits >= 5) { if (tid < 16) Reducer::Reduce(buf[tid] , buf[tid + 16]); - __syncthreads(); + __MSHADOW_EMUSYNC__; } if (x_bits >= 4) { if (tid < 8) Reducer::Reduce(buf[tid] , buf[tid + 8]); diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index 2568bf71d787..6d4b1543f3c8 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -12,18 +12,16 @@ namespace mshadow { namespace cuda { /*! \brief seems CUDAARCH is deprecated in future NVCC */ -#ifndef __CUDA_ARCH__ -#warning "__CUDA_ARCH__ is not defined, I will assume compiling with CUDA verion greater than 2.0" +#ifndef MSHADOW_OLD_CUDA +#define MSHADOW_OLD_CUDA 0 #endif /* load unit for memory access, if CUDAARCH not defined, this is advanced nvcc */ -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200 -const int kMemUnitBits = 5; -const int kMaxThreadsPerBlock = 1024; -#define MSHADOW_MEM_UNIT_BITS 5 -#else +#if MSHADOW_OLD_CUDA > 0 const int kMemUnitBits = 4; const int kMaxThreadsPerBlock = 512; -#define MSHADOW_MEM_UNIT_BITS 4 +#else +const int kMemUnitBits = 5; +const int kMaxThreadsPerBlock = 1024; #endif /*! \brief number of units that can do synchronized update, half warp size */ const int kMemUnit = 1 << kMemUnitBits; @@ -145,7 +143,7 @@ inline void MapReduceKeepLowest(expr::Plan dst, dim3 dimBlock(kMemUnit, kMemUnit); dim3 dimGrid((eshape[1] + kMemUnit - 1) >> kMemUnitBits); CheckLaunchParam(dimGrid, dimBlock, "MapRedKeepLowestKernel"); - MapRedKeepLowestKernel, expr::Plan > <<>>(dst, plan, scale, eshape); From 4ed0e1873c8aa560cbc76fe3c42674b97993e866 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 9 Jan 2015 14:11:28 -0800 Subject: [PATCH 095/147] change macro --- mshadow/base.h | 9 +++++++++ mshadow/cuda/reduce.cuh | 4 ++++ mshadow/cuda/tensor_gpu-inl.cuh | 6 +----- mshadow/extension/concat.h | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/mshadow/base.h b/mshadow/base.h index 2c3ffc621f7d..b031859a6e78 100644 --- a/mshadow/base.h +++ b/mshadow/base.h @@ -65,6 +65,15 @@ #ifndef MSHADOW_USE_CUDA #define MSHADOW_USE_CUDA 1 #endif + +/*! + * \brief seems CUDAARCH is deprecated in future NVCC + * set this to 1 if you want to use CUDA version smaller than 2.0 + */ +#ifndef MSHADOW_OLD_CUDA +#define MSHADOW_OLD_CUDA 0 +#endif + /*! 
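Patches 094 and 095 move the kernel-geometry choice off the __CUDA_ARCH__ probe (which host-side compilation passes never see) onto an explicit MSHADOW_OLD_CUDA switch that base.h defaults to 0. How a pre-Fermi build would opt in, as a sketch (the macro is from the patch; defining it on the compiler command line instead also works):

    // Define before any mshadow include, or pass -DMSHADOW_OLD_CUDA=1 when
    // targeting CUDA arch < 2.0; this selects kMemUnitBits = 4 and
    // 512 threads per block instead of 5 and 1024.
    #define MSHADOW_OLD_CUDA 1
    #include "mshadow/tensor.h"
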
\brief use single precition float */ #ifndef MSHADOW_SINGLE_PRECISION #define MSHADOW_SINGLE_PRECISION 1 diff --git a/mshadow/cuda/reduce.cuh b/mshadow/cuda/reduce.cuh index 3345b353fe7e..8fa0cf1dc061 100644 --- a/mshadow/cuda/reduce.cuh +++ b/mshadow/cuda/reduce.cuh @@ -62,7 +62,11 @@ inline __device__ void ReduceX(volatile DType buf[], int tid) { // in warp optimization if (x_bits >= 5) { if (tid < 16) Reducer::Reduce(buf[tid] , buf[tid + 16]); +#if MSHADOW_OLD_CUDA + __syncthreads(); +#else __MSHADOW_EMUSYNC__; +#endif } if (x_bits >= 4) { if (tid < 8) Reducer::Reduce(buf[tid] , buf[tid + 8]); diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index 6d4b1543f3c8..a65add5237a7 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -11,12 +11,8 @@ namespace mshadow { namespace cuda { -/*! \brief seems CUDAARCH is deprecated in future NVCC */ -#ifndef MSHADOW_OLD_CUDA -#define MSHADOW_OLD_CUDA 0 -#endif /* load unit for memory access, if CUDAARCH not defined, this is advanced nvcc */ -#if MSHADOW_OLD_CUDA > 0 +#if MSHADOW_OLD_CUDA const int kMemUnitBits = 4; const int kMaxThreadsPerBlock = 512; #else diff --git a/mshadow/extension/concat.h b/mshadow/extension/concat.h index 3aa1e3123182..e7ae27735a0f 100644 --- a/mshadow/extension/concat.h +++ b/mshadow/extension/concat.h @@ -1,7 +1,7 @@ #ifndef MSHADOW_EXTENSION_CONCAT_H_ #define MSHADOW_EXTENSION_CONCAT_H_ -#include "mshadow/extension.h" +#include "../extension.h" namespace mshadow { namespace expr { From fb868acecb35c4376f32fdbf2cbb81ee38e2c5fb Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 11 Jan 2015 16:18:48 -0800 Subject: [PATCH 096/147] refactor ps to allow versioning --- mshadow-ps/ps_dist-inl.h | 4 +--- mshadow-ps/ps_local-inl.h | 25 ++++++++++++------------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h index e181c8e88467..c88d5616b14c 100644 --- a/mshadow-ps/ps_dist-inl.h +++ b/mshadow-ps/ps_dist-inl.h @@ -25,11 +25,9 @@ class DistServer : public LocalServer { } // override this function, to use parameter server virtual void HandlePushFinish(Tensor data, - Tensor result_buffer, int key) { - Copy(result_buffer, data[0]); for (index_t i = 1; i < data.size(0); ++i) { - result_buffer += data[i]; + data[0] += data[i]; } // something like //auto callback = [&]() { diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index d698a34ed04b..23193083f9b9 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -146,13 +146,11 @@ class LocalServer : public IParamServer { * \param key the key of the data */ virtual void HandlePushFinish(Tensor data, - Tensor result_buffer, int key) { - Copy(result_buffer, data[0]); for (index_t i = 1; i < data.size(0); ++i) { - result_buffer += data[i]; + data[0] += data[i]; } - this->PullReady(result_buffer, key); + this->PullReady(data[0], key); } private: @@ -174,18 +172,17 @@ class LocalServer : public IParamServer { /*! 
\brief data structure to hold temporal push result */ struct PushEntry { // temporal space to hold input data - TensorContainer data; - // temporal space to hold to copy back - TensorContainer result_buffer; + TensorContainer data; // indicator whether the certain devices is already copied in std::vector copied; // number of data copied in int num_copied; + // version number of data used to hold incomming data in push + int copyin_version; // constructor explicit PushEntry(int ndevice, Shape<2> shape) - : data(false), result_buffer(false) { - data.Resize(Shape3(ndevice, shape[0], shape[1])); - result_buffer.Resize(shape); + : data(false), copyin_version(0) { + data.Resize(Shape4(2, ndevice, shape[0], shape[1])); num_copied = 0; copied.resize(ndevice, false); } @@ -274,19 +271,21 @@ class LocalServer : public IParamServer { } const int wid = GetWorkIndex(tsk.devid); PushEntry &e = *push_buffer[tsk.key]; - utils::Check(e.data[0].shape_ == tsk.data.shape_, + utils::Check(e.data[0][0].shape_ == tsk.data.shape_, "Tensor with same key must share same shape"); utils::Assert(!e.copied[wid], "data inconsistency"); // start copy SetDevice(tsk.devid); - Copy(e.data[wid], tsk.data, push_stream[wid]); + Copy(e.data[e.copyin_version][wid], tsk.data, push_stream[wid]); // wait till the copy finishes push_stream[wid]->Wait(); // mark copied e.copied[wid] = true; e.num_copied += 1; if (e.num_copied >= static_cast(devices.size())) { - this->HandlePushFinish(e.data, e.result_buffer, tsk.key); + this->HandlePushFinish(e.data[e.copyin_version], tsk.key); + // switch version + e.copyin_version = (e.copyin_version + 1) % e.data.size(0); std::fill(e.copied.begin(), e.copied.end(), false); e.num_copied = 0; } From 744fa649ca37098274aee4a93d2a04a980843323 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 11 Jan 2015 16:31:47 -0800 Subject: [PATCH 097/147] support gather --- mshadow-ps/ps_local-inl.h | 46 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 23193083f9b9..6bcb2ef8503a 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -36,6 +36,18 @@ class LocalServer : public IParamServer { wait_lock.Destroy(); wait_cond.Destroy(); } + virtual void SetParam(const char *name, const char *val) { + int key; + if (sscanf(name, "push_op[%d]", &key) == 1) { + if (!strcmp(val, "gather")) { + push_operation[key] = kGather; return; + } + if (!strcmp(val, "sum")) { + push_operation[key] = kSum; return; + } + utils::Error("unknown push operation %s", val); + } + } virtual void PullWait(int key, int devid) { const int wid = GetWorkIndex(devid); PullEntry *p = pull_map.Get(key); @@ -82,6 +94,16 @@ class LocalServer : public IParamServer { thread_pull_handler.Start(PullHandlerThread, this); } protected: + /*! \brief operation performed locally in PS */ + enum LocalOp { + /*! \brief take sum of all devices over the same key */ + kSum = 0, + /*! 
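The Shape4(2, ndevice, rows, cols) staging buffer in PushEntry above is a double buffer over rounds: while round t is still being reduced out of one slot, devices may already copy round t+1 into the other. A minimal sketch of the rotation invariant, distilled from the copyin_version bookkeeping (two slots, matching the patch):

    // A slot is handed to HandlePushFinish only once every device has
    // copied into it; new pushes then immediately target the other slot.
    struct VersionClock {
      static const int kSlots = 2;  // matches Shape4(2, ...) in PushEntry
      int copyin;                   // slot currently accepting pushes
      VersionClock(void) : copyin(0) {}
      int Rotate(void) {            // call when the current slot is full
        const int full = copyin;
        copyin = (copyin + 1) % kSlots;  // next round fills the other slot
        return full;                     // reduce and pull from this one
      }
    };
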
+ * \brief concatenate(gather), + * the tensors in all devices with same key + */ + kGather = 1 + }; virtual void Push_(Tensor data, int key, int devid, int priority) { this->InitPullMap(key, devid); @@ -147,10 +169,26 @@ class LocalServer : public IParamServer { */ virtual void HandlePushFinish(Tensor data, int key) { - for (index_t i = 1; i < data.size(0); ++i) { - data[0] += data[i]; + LocalOp op = kSum; + typename std::map::const_iterator + it = push_operation.find(key); + if (it != push_operation.end() && it->first == key) { + op = it->second; + } + switch (op) { + case kSum: { + for (index_t i = 1; i < data.size(0); ++i) { + data[0] += data[i]; + } + this->PullReady(data[0], key); + return; + } + case kGather: { + this->PullReady(data.FlatTo2D(), key); + return; + } + default: utils::Error("unknown LocalOp"); } - this->PullReady(data[0], key); } private: @@ -241,6 +279,8 @@ class LocalServer : public IParamServer { utils::Thread thread_push_handler; // the map of push buffer std::map push_buffer; + // customized local reduction operation + std::map push_operation; //----- data structure used to support pull ---- // the queue used for pull task utils::ThreadPQueue > pull_queue; From e8ea88f152cbaf54bb318a1b0caab951ac421cac Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 12 Jan 2015 16:13:10 -0800 Subject: [PATCH 098/147] fix gather --- mshadow-ps/ps_local-inl.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 6bcb2ef8503a..2f0513d08ee4 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -40,7 +40,10 @@ class LocalServer : public IParamServer { int key; if (sscanf(name, "push_op[%d]", &key) == 1) { if (!strcmp(val, "gather")) { - push_operation[key] = kGather; return; + request_lock.Lock(); + push_operation[key] = kGather; + request_lock.Unlock(); + return; } if (!strcmp(val, "sum")) { push_operation[key] = kSum; return; From c787135c8fe97c6aa1ed615bc018726d90d40182 Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 12 Jan 2015 16:13:41 -0800 Subject: [PATCH 099/147] ok --- mshadow-ps/ps_local-inl.h | 129 +++++++++++++++++++++++++++++--------- 1 file changed, 101 insertions(+), 28 deletions(-) diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 23193083f9b9..7acd1135792d 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -22,19 +22,32 @@ class LocalServer : public IParamServer { // redefine callback function typedef typename IParamServer::CallbackFunction CallbackFunction; + + LocalServer(void) { + init_end = 0; + perdev_pull_thread = 0; + } // destructor virtual ~LocalServer(void) { - destroy_signal = true; - push_queue.Abort(1); - pull_queue.Abort(1); - thread_push_handler.Join(); - thread_pull_handler.Join(); - push_queue.Destroy(); - pull_queue.Destroy(); - pull_map.Destroy(); - request_lock.Destroy(); - wait_lock.Destroy(); - wait_cond.Destroy(); + if (init_end != 0) { + destroy_signal = true; + push_queue.Abort(1); + for (size_t i = 0; i < pull_queues.size(); ++i) { + pull_queues[i].Abort(1); + } + thread_push_handler.Join(); + for (size_t i = 0; i < thread_pull_handler.size(); ++i) { + thread_pull_handler[i].Join(); + } + push_queue.Destroy(); + for (size_t i = 0; i < pull_queues.size(); ++i) { + pull_queues[i].Destroy(); + } + pull_map.Destroy(); + request_lock.Destroy(); + wait_lock.Destroy(); + wait_cond.Destroy(); + } } virtual void PullWait(int key, int devid) { const int wid = GetWorkIndex(devid); @@ -57,6 +70,8 @@ class 
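The push_op[] parameter introduced above is the per-key reduction knob; keys default to sum. A configuration sketch (SetParam spellings are from the patch; the Create factory's template arguments are an assumption, since test.cpp shows only the call form):

    #include <vector>

    void ConfigureGather(const std::vector<int> &devs) {
      ps::IParamServer<gpu, float> *ps = ps::Create<gpu, float>("local");
      ps->SetParam("push_op[3]", "gather");  // key 3: concatenate per device
      ps->SetParam("push_op[4]", "sum");     // key 4: elementwise sum (default)
      ps->Init(devs);
      // After a gather push of per-device (r, c) tensors, the pulled result
      // for key 3 is the FlatTo2D view, i.e. shape (ndev * r, c).
    }
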
LocalServer : public IParamServer { } } virtual void Init(const std::vector &devices) { + utils::Check(init_end == 0, + "LocalServer.Init can only call Init once"); utils::Check(devices.size() != 0, "LocalServer.Init: must at least contain 1 devices"); this->devices = devices; @@ -70,17 +85,40 @@ class LocalServer : public IParamServer { } dev2index[devid] = static_cast(i); } + // allocate space + pull_stream.resize(devices.size()); // initialize all the thread related things push_queue.Init(); - pull_queue.Init(); pull_map.Init(); request_lock.Init(); wait_lock.Init(); wait_cond.Init(); + if (perdev_pull_thread != 0) { + pull_queues.resize(devices.size()); + } else { + pull_queues.resize(1); + } + for (size_t i = 0; i < pull_queues.size(); ++i) { + pull_queues[i].Init(); + } // initialize the thread thread_push_handler.Start(PushHandlerThread, this); - thread_pull_handler.Start(PullHandlerThread, this); + // initialize pull handler + if (perdev_pull_thread != 0) { + thread_pull_handler.resize(devices.size()); + for (size_t i = 0; i < devices.size(); ++i) { + std::pair *p + = new std::pair(); + *p = std::make_pair(this, i); + thread_pull_handler[i].Start(PullGlobalThread, p); + } + } else { + thread_pull_handler.resize(1); + thread_pull_handler[0].Start(PullGlobalThread, this); + } + this->init_end = 0; } + protected: virtual void Push_(Tensor data, int key, int devid, int priority) { @@ -112,7 +150,11 @@ class LocalServer : public IParamServer { "key = %d, cannot send duplicate pull request before it finishes", key); if (e.req[wid].ready) { - pull_queue.Push(std::make_pair(key, devid)); + if (perdev_pull_thread != 0) { + pull_queues[wid].Push(std::make_pair(key, devid)); + } else { + pull_queues[0].Push(std::make_pair(key, devid)); + } } else { r.pending = true; } @@ -132,7 +174,11 @@ class LocalServer : public IParamServer { for (index_t i = 0; i < e.req.size(); ++i) { e.req[i].ready = true; if (e.req[i].pending) { - pull_queue.Push(std::make_pair(key, devices[i])); + if (perdev_pull_thread != 0) { + pull_queues[i].Push(std::make_pair(key, devices[i])); + } else { + pull_queues[0].Push(std::make_pair(key, devices[i])); + } e.req[i].pending = false; } } @@ -243,19 +289,23 @@ class LocalServer : public IParamServer { std::map push_buffer; //----- data structure used to support pull ---- // the queue used for pull task - utils::ThreadPQueue > pull_queue; + std::vector > > pull_queues; // stream used by pull thread each device for memcpy std::vector*> pull_stream; // the map to store pull status utils::ThreadSafeMap pull_map; // thread to handle pull task - utils::Thread thread_pull_handler; + std::vector thread_pull_handler; // lock to lock request field utils::Mutex request_lock; // lock to lock wait field utils::Mutex wait_lock; // conditional variable to do waiting utils::ConditionVariable wait_cond; + //---------configurations of server------- + int init_end; + // whether use pull thread per device + int perdev_pull_thread; // push handler inline void PushHandler(void) { // allocate stream resources @@ -311,16 +361,11 @@ class LocalServer : public IParamServer { return NULL; } - // push handler - inline void PullHandler(void) { - // allocate stream resources - for (size_t i = 0; i < devices.size(); ++i) { - SetDevice(devices[i]); - pull_stream.push_back(NewStream()); - } + // push handler procedure + inline void PullProc(utils::ThreadPQueue > *queue) { while (!destroy_signal) { std::pair tsk; - if (pull_queue.Pop(&tsk)) { + if (queue->Pop(&tsk)) { const int key = tsk.first; const int 
devid = tsk.second; const int wid = GetWorkIndex(devid); @@ -355,15 +400,43 @@ class LocalServer : public IParamServer { utils::Assert(destroy_signal, "abort but not destroy"); } } + } + // use one thread for all pull actions + inline void PullHandlerGlobal(void) { + // allocate stream resources + for (size_t i = 0; i < devices.size(); ++i) { + SetDevice(devices[i]); + pull_stream[i] = NewStream(); + } + this->PullProc(&pull_queues[0]); // free resources for (size_t i = 0; i < devices.size(); ++i) { SetDevice(devices[i]); DeleteStream(pull_stream[i]); } } - /*!\brief entry point of loader thread */ - inline static MSHADOW_THREAD_PREFIX PullHandlerThread(void *pthread) { - static_cast(pthread)->PullHandler(); + inline void PullHandlerLocal(size_t tid) { + utils::Assert(tid < devices.size(), "threadid exceed boundary"); + utils::Assert(pull_queues.size() == devices.size(), + "must have one pull_queue per device"); + // allocate stream resources + SetDevice(devices[tid]); + pull_stream[tid] = NewStream(); + this->PullProc(&pull_queues[tid]); + SetDevice(devices[tid]); + DeleteStream(pull_stream[tid]); + } + /*!\brief entry point of pull thread, one thread for all devices */ + inline static MSHADOW_THREAD_PREFIX PullGlobalThread(void *arg) { + static_cast(arg)->PullHandlerGlobal(); + utils::ThreadExit(NULL); + return NULL; + } + inline static MSHADOW_THREAD_PREFIX PullLocalThread(void *arg) { + std::pair *p + = static_cast*>(arg); + p->first->PullHandlerLocal(p->second); + delete p; utils::ThreadExit(NULL); return NULL; } From 442539ded7ff8c90b442047a5f322b313235d63d Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 12 Jan 2015 16:34:09 -0800 Subject: [PATCH 100/147] add per device pull thread --- mshadow-ps/ps_local-inl.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 9790c7e2169e..414b36af63c4 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -22,10 +22,10 @@ class LocalServer : public IParamServer { // redefine callback function typedef typename IParamServer::CallbackFunction CallbackFunction; - + // constructor LocalServer(void) { init_end = 0; - perdev_pull_thread = 0; + perdev_pull_thread = 1; } // destructor virtual ~LocalServer(void) { @@ -63,6 +63,16 @@ class LocalServer : public IParamServer { } utils::Error("unknown push operation %s", val); } + if (!strcmp(name, "pull_thread")) { + if (!strcmp(val, "ndev")) { + perdev_pull_thread = 1; + } else if (!strcmp(val, "one")) { + perdev_pull_thread = 0; + } else { + utils::Error("invalid value for parameter pull_thread,"\ + " can only be ndev or one"); + } + } } virtual void PullWait(int key, int devid) { const int wid = GetWorkIndex(devid); @@ -125,7 +135,7 @@ class LocalServer : public IParamServer { std::pair *p = new std::pair(); *p = std::make_pair(this, i); - thread_pull_handler[i].Start(PullGlobalThread, p); + thread_pull_handler[i].Start(PullLocalThread, p); } } else { thread_pull_handler.resize(1); @@ -403,7 +413,6 @@ class LocalServer : public IParamServer { utils::ThreadExit(NULL); return NULL; } - // push handler procedure inline void PullProc(utils::ThreadPQueue > *queue) { while (!destroy_signal) { From fac7c4f2717300ef91ed4aa94a935a053107927c Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 12 Jan 2015 17:59:33 -0800 Subject: [PATCH 101/147] threadsafe push --- mshadow-ps/ps_local-inl.h | 47 +++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git 
a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 414b36af63c4..51e00f2c1a7b 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -40,6 +40,8 @@ class LocalServer : public IParamServer { thread_pull_handler[i].Join(); } push_queue.Destroy(); + push_map.Destroy(); + push_lock.Destroy(); for (size_t i = 0; i < pull_queues.size(); ++i) { pull_queues[i].Destroy(); } @@ -114,6 +116,8 @@ class LocalServer : public IParamServer { pull_stream.resize(devices.size()); // initialize all the thread related things push_queue.Init(); + push_map.Init(); + push_lock.Init(); pull_map.Init(); request_lock.Init(); wait_lock.Init(); @@ -158,6 +162,7 @@ class LocalServer : public IParamServer { virtual void Push_(Tensor data, int key, int devid, int priority) { this->InitPullMap(key, devid); + this->InitPushMap(key, data.shape_); push_queue.Push(PullTask(data, key, devid), priority); } virtual void PullReq_(Tensor data, @@ -277,8 +282,10 @@ class LocalServer : public IParamServer { // version number of data used to hold incomming data in push int copyin_version; // constructor - explicit PushEntry(int ndevice, Shape<2> shape) - : data(false), copyin_version(0) { + PushEntry(void) + : data(false), copyin_version(0) {} + // constructor + inline void Init(int ndevice, Shape<2> shape) { data.Resize(Shape4(2, ndevice, shape[0], shape[1])); num_copied = 0; copied.resize(ndevice, false); @@ -336,8 +343,10 @@ class LocalServer : public IParamServer { utils::ThreadPQueue push_queue; // thread to handle push task utils::Thread thread_push_handler; + // lock to lock push field + utils::Mutex push_lock; // the map of push buffer - std::map push_buffer; + utils::ThreadSafeMap push_map; // customized local reduction operation std::map push_operation; //----- data structure used to support pull ---- @@ -369,11 +378,8 @@ class LocalServer : public IParamServer { while (!destroy_signal) { PullTask tsk; if (push_queue.Pop(&tsk)) { - if (push_buffer.count(tsk.key) == 0) { - push_buffer[tsk.key] = new PushEntry(devices.size(), tsk.data.shape_); - } const int wid = GetWorkIndex(tsk.devid); - PushEntry &e = *push_buffer[tsk.key]; + PushEntry &e = push_map.GetRef(tsk.key); utils::Check(e.data[0][0].shape_ == tsk.data.shape_, "Tensor with same key must share same shape"); utils::Assert(!e.copied[wid], "data inconsistency"); @@ -384,14 +390,20 @@ class LocalServer : public IParamServer { push_stream[wid]->Wait(); // mark copied e.copied[wid] = true; + push_lock.Lock(); e.num_copied += 1; - if (e.num_copied >= static_cast(devices.size())) { - this->HandlePushFinish(e.data[e.copyin_version], tsk.key); + int cp_version = e.copyin_version; + bool push_finish = e.num_copied >= static_cast(devices.size()); + if (push_finish) { // switch version e.copyin_version = (e.copyin_version + 1) % e.data.size(0); std::fill(e.copied.begin(), e.copied.end(), false); e.num_copied = 0; } + push_lock.Unlock(); + if (push_finish) { + this->HandlePushFinish(e.data[cp_version], tsk.key); + } } else { utils::Assert(destroy_signal, "abort but not destroy"); } @@ -401,11 +413,6 @@ class LocalServer : public IParamServer { SetDevice(devices[i]); DeleteStream(push_stream[i]); } - for (typename std::map::iterator - it = push_buffer.begin(); it != push_buffer.end(); ++it) { - delete it->second; - } - push_buffer.clear(); } /*!\brief entry point of loader thread */ inline static MSHADOW_THREAD_PREFIX PushHandlerThread(void *pthread) { @@ -521,6 +528,18 @@ class LocalServer : public IParamServer { wait_lock.Unlock(); } } + // 
functions to handle pull + inline void InitPushMap(int key, Shape<2> shape) { + push_map.Init(key); + PushEntry &e = push_map.GetRef(key); + if (e.copied.size() == 0) { + push_lock.Lock(); + if (e.copied.size() == 0) { + e.Init(devices.size(), shape); + } + push_lock.Unlock(); + } + } }; } // namespace ps } // namespace mshadow From b332d11ff50bcc89335b37f123731458957c5738 Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 13 Jan 2015 19:42:36 -0800 Subject: [PATCH 102/147] add contiguous check --- mshadow/tensor.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mshadow/tensor.h b/mshadow/tensor.h index 6a923fe81409..d54813a49b19 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -293,6 +293,13 @@ struct Tensor: public TRValue, } return memsz; } + /*! + * \return whether the tensor's memory is continuous + * x dimension same as stride + */ + MSHADOW_XINLINE bool CheckContiguous(void) const { + return this->shape_[dimension - 1] == stride_; + } /*! * \return memory cost of the tensor, including the aligned x dimension */ @@ -387,6 +394,9 @@ struct Tensor: s[0] = end - begin; return Tensor(dptr_ + begin, s, s[0], stream_); } + MSHADOW_XINLINE bool CheckContiguous(void) const { + return true; + } MSHADOW_XINLINE size_t MSize(void) const { return shape_[0]; } From e2f37798db09bcb423652858a9f32bbce2071643 Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 13 Jan 2015 20:12:39 -0800 Subject: [PATCH 103/147] add perdev push thread, need review --- mshadow-ps/ps_local-inl.h | 101 +++++++++++++++++++++++++++++++------- 1 file changed, 83 insertions(+), 18 deletions(-) diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 51e00f2c1a7b..68f264dcbad2 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -26,20 +26,27 @@ class LocalServer : public IParamServer { LocalServer(void) { init_end = 0; perdev_pull_thread = 1; + perdev_push_thread = 0; } // destructor virtual ~LocalServer(void) { if (init_end != 0) { destroy_signal = true; - push_queue.Abort(1); + for (size_t i = 0; i < push_queues.size(); ++i) { + push_queues[i].Abort(1); + } for (size_t i = 0; i < pull_queues.size(); ++i) { pull_queues[i].Abort(1); } - thread_push_handler.Join(); + for (size_t i = 0; i < thread_push_handler.size(); ++i) { + thread_push_handler[i].Join(); + } for (size_t i = 0; i < thread_pull_handler.size(); ++i) { thread_pull_handler[i].Join(); } - push_queue.Destroy(); + for (size_t i = 0; i < push_queues.size(); ++i) { + push_queues[i].Destroy(); + } push_map.Destroy(); push_lock.Destroy(); for (size_t i = 0; i < pull_queues.size(); ++i) { @@ -75,6 +82,16 @@ class LocalServer : public IParamServer { " can only be ndev or one"); } } + if (!strcmp(name, "push_thread")) { + if (!strcmp(val, "ndev")) { + perdev_push_thread = 1; + } else if (!strcmp(val, "one")) { + perdev_push_thread = 0; + } else { + utils::Error("invalid value for parameter push_thread,"\ + " can only be ndev or one"); + } + } } virtual void PullWait(int key, int devid) { const int wid = GetWorkIndex(devid); @@ -114,8 +131,16 @@ class LocalServer : public IParamServer { } // allocate space pull_stream.resize(devices.size()); + push_stream.resize(devices.size()); // initialize all the thread related things - push_queue.Init(); + if (perdev_push_thread != 0) { + push_queues.resize(devices.size()); + } else { + push_queues.resize(1); + } + for (size_t i = 0; i < push_queues.size(); ++i) { + push_queues[i].Init(); + } push_map.Init(); push_lock.Init(); pull_map.Init(); @@ -131,7 +156,18 @@ class LocalServer 
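CheckContiguous above lets callers distinguish flat-copyable tensors (x dimension equal to the stride) from stride-padded ones before treating the memory as one block. A sketch, assuming the stride-aware constructor added earlier in the series:

    void ContiguityExample() {
      float data[48];
      // stride 12 > width 10: rows are padded, so a flat copy over MSize()
      // elements would also sweep up the padding.
      Tensor<cpu, 2, float> padded(data, Shape2(4, 10), 12, NULL);
      // padded.CheckContiguous() == false
      Tensor<cpu, 2, float> flat(data, Shape2(4, 10), 10, NULL);
      // flat.CheckContiguous() == true
    }
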
: public IParamServer { pull_queues[i].Init(); } // initialize the thread - thread_push_handler.Start(PushHandlerThread, this); + if (perdev_push_thread != 0) { + thread_push_handler.resize(devices.size()); + for (size_t i = 0; i < devices.size(); ++i) { + std::pair *p + = new std::pair(); + *p = std::make_pair(this, i); + thread_push_handler[i].Start(PushLocalThread, p); + } + } else { + thread_push_handler.resize(1); + thread_push_handler[0].Start(PushGlobalThread, this); + } // initialize pull handler if (perdev_pull_thread != 0) { thread_pull_handler.resize(devices.size()); @@ -163,7 +199,12 @@ class LocalServer : public IParamServer { int key, int devid, int priority) { this->InitPullMap(key, devid); this->InitPushMap(key, data.shape_); - push_queue.Push(PullTask(data, key, devid), priority); + if (perdev_push_thread != 0) { + int wid = GetWorkIndex(devid); + push_queues[wid].Push(PullTask(data, key, devid), priority); + } else { + push_queues[0].Push(PullTask(data, key, devid), priority); + } } virtual void PullReq_(Tensor data, int key, int devid, int priority, @@ -340,9 +381,9 @@ class LocalServer : public IParamServer { // stream used by push thread each device for memcpy std::vector*> push_stream; // the queue used for push task - utils::ThreadPQueue push_queue; + std::vector > push_queues; // thread to handle push task - utils::Thread thread_push_handler; + std::vector thread_push_handler; // lock to lock push field utils::Mutex push_lock; // the map of push buffer @@ -368,16 +409,13 @@ class LocalServer : public IParamServer { int init_end; // whether use pull thread per device int perdev_pull_thread; + // whether use push thread per device + int perdev_push_thread; // push handler - inline void PushHandler(void) { - // allocate stream resources - for (size_t i = 0; i < devices.size(); ++i) { - SetDevice(devices[i]); - push_stream.push_back(NewStream()); - } + inline void PushProc(utils::ThreadPQueue *queue) { while (!destroy_signal) { PullTask tsk; - if (push_queue.Pop(&tsk)) { + if (queue->Pop(&tsk)) { const int wid = GetWorkIndex(tsk.devid); PushEntry &e = push_map.GetRef(tsk.key); utils::Check(e.data[0][0].shape_ == tsk.data.shape_, @@ -408,15 +446,42 @@ class LocalServer : public IParamServer { utils::Assert(destroy_signal, "abort but not destroy"); } } + } + inline void PushHandlerGlobal(void) { + // allocate stream resources + for (size_t i = 0; i < devices.size(); ++i) { + SetDevice(devices[i]); + push_stream[i] = NewStream(); + } + this->PushProc(&push_queues[0]); // free resources for (size_t i = 0; i < devices.size(); ++i) { SetDevice(devices[i]); DeleteStream(push_stream[i]); - } + } } + inline void PushHandlerLocal(size_t tid) { + utils::Assert(tid < devices.size(), "threadid exceed boundary"); + utils::Assert(push_queues.size() == devices.size(), + "must have one pull_queue per device"); + // allocate stream resources + SetDevice(devices[tid]); + push_stream[tid] = NewStream(); + this->PushProc(&push_queues[tid]); + SetDevice(devices[tid]); + DeleteStream(push_stream[tid]); + } /*!\brief entry point of loader thread */ - inline static MSHADOW_THREAD_PREFIX PushHandlerThread(void *pthread) { - static_cast(pthread)->PushHandler(); + inline static MSHADOW_THREAD_PREFIX PushGlobalThread(void *pthread) { + static_cast(pthread)->PushHandlerGlobal(); + utils::ThreadExit(NULL); + return NULL; + } + inline static MSHADOW_THREAD_PREFIX PushLocalThread(void *arg) { + std::pair *p + = static_cast*>(arg); + p->first->PushHandlerLocal(p->second); + delete p; 
utils::ThreadExit(NULL); return NULL; } From 2fdf83535b9c7738274f8700079fb29056c12dfc Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 13 Jan 2015 21:52:49 -0800 Subject: [PATCH 104/147] server ready --- mshadow-ps/ps_local-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 68f264dcbad2..b315c9de8b77 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -26,7 +26,7 @@ class LocalServer : public IParamServer { LocalServer(void) { init_end = 0; perdev_pull_thread = 1; - perdev_push_thread = 0; + perdev_push_thread = 1; } // destructor virtual ~LocalServer(void) { From 004a3431f9aadaf19aa682bcd47628b7f662d97d Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Fri, 16 Jan 2015 17:03:47 -0700 Subject: [PATCH 105/147] fix init problem --- mshadow-ps/ps_local-inl.h | 6 ++++-- mshadow-ps/thread.h | 17 +++++++++++++---- mshadow-ps/thread_util.h | 3 ++- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index b315c9de8b77..09e5682aec28 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -27,6 +27,7 @@ class LocalServer : public IParamServer { init_end = 0; perdev_pull_thread = 1; perdev_push_thread = 1; + destroy_signal = false; } // destructor virtual ~LocalServer(void) { @@ -119,6 +120,7 @@ class LocalServer : public IParamServer { utils::Check(devices.size() != 0, "LocalServer.Init: must at least contain 1 devices"); this->devices = devices; + destroy_signal = false; // initialize device id to local index dev2index.clear(); for (size_t i = 0; i < devices.size(); ++i) { @@ -514,8 +516,8 @@ class LocalServer : public IParamServer { "PullHandler, must initialize the key, req"); PullWaitRecord &w = e.wait[wid]; wait_lock.Lock(); - w.finished = true; - if(w.nwait != 0) { + w.finished = true; + if (w.nwait != 0) { wait_cond.Broadcast(); } wait_lock.Unlock(); diff --git a/mshadow-ps/thread.h b/mshadow-ps/thread.h index 395832c86b9f..7451f12052b0 100644 --- a/mshadow-ps/thread.h +++ b/mshadow-ps/thread.h @@ -55,6 +55,7 @@ inline void ThreadExit(void *status) { // thread interface using g++ #include #include +#include namespace mshadow { namespace utils { /*!\brief semaphore class */ @@ -105,16 +106,24 @@ class Semaphore { sem_t sem; public: inline void Init(int init_val) { - sem_init(&sem, 0, init_val); + if (sem_init(&sem, 0, init_val) != 0) { + utils::Error("Semaphore.Init:%s", strerror(errno)); + } } inline void Destroy(void) { - sem_destroy(&sem); + if (sem_destroy(&sem) != 0) { + utils::Error("Semaphore.Destroy:%s", strerror(errno)); + } } inline void Wait(void) { - sem_wait(&sem); + if (sem_wait(&sem) != 0) { + utils::Error("Semaphore.Wait:%s", strerror(errno)); + } } inline void Post(void) { - sem_post(&sem); + if (sem_post(&sem) != 0) { + utils::Error("Semaphore.Post:%s", strerror(errno)); + } } #endif }; diff --git a/mshadow-ps/thread_util.h b/mshadow-ps/thread_util.h index 660099c16df2..607d69f83c3a 100644 --- a/mshadow-ps/thread_util.h +++ b/mshadow-ps/thread_util.h @@ -50,7 +50,7 @@ class ThreadPQueue { lock_.Lock(); queue_.push(Entry(data, priority)); lock_.Unlock(); - counter_.Post(); + counter_.Post(); } /*! 
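With patches 100 through 104 above, the copy-thread layout is configurable per direction, and after patch 104 both directions default to one thread per device. A configuration sketch (parameter spellings exactly as parsed by SetParam above):

    #include <vector>

    void ConfigureThreads(ps::IParamServer<gpu, float> *ps,
                          const std::vector<int> &devs) {
      // Must be set before Init(): "ndev" = one copy thread per device,
      // "one" = a single thread shared by all devices.
      ps->SetParam("pull_thread", "ndev");
      ps->SetParam("push_thread", "one");
      ps->Init(devs);
    }
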
* \brief pop an element from the queue @@ -65,6 +65,7 @@ class ThreadPQueue { if (queue_.size() == 0) { lock_.Unlock(); return false; } + utils::Assert(queue_.size() != 0, "Queue.Pop"); *data_out = queue_.top().data; queue_.pop(); lock_.Unlock(); From a35cff8561130a7fe063154561e196fe07f21cd1 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Fri, 16 Jan 2015 18:13:44 -0700 Subject: [PATCH 106/147] ok --- mshadow-ps/Makefile | 9 +++++---- mshadow-ps/test.cu | 48 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 4 deletions(-) create mode 100644 mshadow-ps/test.cu diff --git a/mshadow-ps/Makefile b/mshadow-ps/Makefile index bec318a84e8c..89682422d48d 100644 --- a/mshadow-ps/Makefile +++ b/mshadow-ps/Makefile @@ -1,8 +1,8 @@ # set LD_LIBRARY_PATH export CC = gcc -export CXX = clang++ +export CXX = g++ export NVCC =nvcc -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -DMSHADOW_STAND_ALONE=1 -std=c++11 +export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ export LDFLAGS= -lm -lpthread export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) @@ -10,12 +10,13 @@ export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) BIN = test OBJ = CUOBJ = -CUBIN = +CUBIN = cutest .PHONY: clean all -all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ) +all: $(BIN) $(OBJ) test: test.cpp *.h +cutest: test.cu *.h $(BIN) : $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) diff --git a/mshadow-ps/test.cu b/mshadow-ps/test.cu new file mode 100644 index 000000000000..484793fccdf0 --- /dev/null +++ b/mshadow-ps/test.cu @@ -0,0 +1,48 @@ +#include "./ps.h" +using namespace mshadow; +void Print1DTensor(Tensor const &ts) { + for (index_t i = 0; i < ts.size(0); ++i) { + printf("%.2f ", ts[i]); + } + printf("\n"); +} + +void Print2DTensor(Tensor const &ts) { + for (index_t i = 0; i < ts.size(0); ++i) { + Print1DTensor(ts[i]); + } +} + +int main(int argc, char *argv[]) { + if (argc < 2) { + printf("Usage:\n"); return 0; + } + int ndev = atoi(argv[1]); + ps::IParamServer *ps = ps::Create("local"); + TensorContainer ts(Shape3(ndev,5,2)); + TensorContainer res(Shape3(ndev,5,2)); + TensorContainer tscpu(Shape3(ndev,5,2)); + TensorContainer rescpu(Shape3(ndev,5,2)); + std::vector devs; + for (int i = 0; i < ndev; ++i) { + devs.push_back(i); + tscpu[i] = 1.0 + i; + } + mshadow::Copy(ts, tscpu); + ps->Init(devs); + for (int i = 0; i < ndev; ++i) { + ps->Push(ts[i], 3, i); + ps->PullWait(3, i); + ps->PullReq(res[i], 3, i, 0, + ); + } + for (int i = 0; i < ndev; ++i) { + ps->PullWait(3, i); + } + mshadow::Copy(rescpu, res); + for (int i = 0; i < ndev; ++i) { + printf("----dev=%d----\n", i); + Print2DTensor(rescpu[i]); + } + return 0; +} From 8c142036c0ce2ce6e795f3883e1a45fbbe8ed3af Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Fri, 16 Jan 2015 18:14:50 -0700 Subject: [PATCH 107/147] ok --- mshadow-ps/test.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mshadow-ps/test.cu b/mshadow-ps/test.cu index 484793fccdf0..883f4a256110 100644 --- a/mshadow-ps/test.cu +++ b/mshadow-ps/test.cu @@ -33,8 +33,7 @@ int main(int argc, char *argv[]) { for (int i = 0; i < ndev; ++i) { ps->Push(ts[i], 3, i); ps->PullWait(3, i); - ps->PullReq(res[i], 3, i, 0, - ); + ps->PullReq(res[i], 3, i, 0); } for (int i = 0; i < ndev; ++i) { ps->PullWait(3, i); From 4cbc0e9ed19b227df32b7c80eefb2b9648ee38d1 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 17 Jan 2015 12:39:20 -0800 Subject: [PATCH 108/147] concept change of local ps --- mshadow-ps/ps_dist-inl.h | 59 
++++++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h index c88d5616b14c..6a32c6c26c22 100644 --- a/mshadow-ps/ps_dist-inl.h +++ b/mshadow-ps/ps_dist-inl.h @@ -7,7 +7,9 @@ */ #ifndef MSHADOW_PS_DIST_INL_H_ #define MSHADOW_PS_DIST_INL_H_ +#include "./ps.h" #include "./ps_local-inl.h" +#include "./kv_array.h" namespace mshadow { namespace ps { @@ -17,26 +19,65 @@ class DistServer : public LocalServer { public: // parent type typedef LocalServer Parent; + virtual void SetParam(const char *name, const char *val) { + Parent::SetParam(name, val); + if (!strcmp(name, "name")) name_ = val; + if (!strcmp(name, "parent_name")) parent_name_ = val; + } // initialize the parameter server virtual void Init(const std::vector &devices) { Parent::Init(devices); + CHECK(!name_.empty()); + CHECK(!parent_name_.empty()); + shared_model_ = new PS::KVArray(name_, parent_name_); } - virtual ~DistServer(void) { + virtual ~DistServer(void) { } // override this function, to use parameter server virtual void HandlePushFinish(Tensor data, int key) { + // here we only use sum reduction, can change to others for (index_t i = 1; i < data.size(0); ++i) { data[0] += data[i]; } - // something like - //auto callback = [&]() { - // receive data into dptr - // call pullready to notify the module - //this->PullReady(recvdata, key); - //} - // push(key, data[0].dptr_, data.MSize(), callback); - } + // things to send and recv + Tensor sendrecv = data[0]; + using namespace PS; + utils::Assert(data[0].CheckContiguous(), + "data must be contiguous"); + // TODO the zero copy version + // SArray val(data.dptr_, data.MSize(), false); + SArray val; val.copyFrom(sendrecv.dptr_, sendrecv.MSize()); + MessagePtr msg(new Message(kServerGroup)); + msg->addValue(val); + msg->task.set_key_channel(key); + Range(0, val.size()).to(msg->task.mutable_key_range()); + records_[key].push = CHECK_NOTNULL(shared_model_)->push(msg); + // setup callback + auto& rec = records_[key]; + MessagePtr msg(new Message(kServerGroup, -1, rec.push)); + msg->task.set_key_channel(key); + Range(0, sendrecv.MSize()).to(msg->task.mutable_key_range()); + + msg->fin_handle = [this, sendrecv, key]() { + const auto& recv = shared_model_->array(key); + CHECK_EQ(sendrecv.MSize(), recv.size()); + memcpy(CHECK_NOTNULL(sendrecv.dptr_), recv.data(), recv.size() * sizeof(DType)); + // call PullReady to notify LocalServer pulling is ready + this->PullReady(sendrecv, key); + }; + rec.pull = CHECK_NOTNULL(shared_model_)->pull(msg); + } + + private: + struct Record { + int push = -1; + int pull = -1; + DType* data = nullptr; + }; + std::string name_; + std::string parent_name_; + PS::KVArray* shared_model_; }; #endif } // namespace ps From e68e816ea997f30aaac8ea6a8fa5450acbea9848 Mon Sep 17 00:00:00 2001 From: muli Date: Sat, 17 Jan 2015 16:34:48 -0500 Subject: [PATCH 109/147] update ps_dist-inl.h --- mshadow-ps/kv_array.h | 108 +++++++++++++++++++++++++++++++++++++++ mshadow-ps/ps_dist-inl.h | 51 ++++++++---------- 2 files changed, 129 insertions(+), 30 deletions(-) create mode 100644 mshadow-ps/kv_array.h diff --git a/mshadow-ps/kv_array.h b/mshadow-ps/kv_array.h new file mode 100644 index 000000000000..4eac0ca93e04 --- /dev/null +++ b/mshadow-ps/kv_array.h @@ -0,0 +1,108 @@ +#pragma once +#include "parameter/shared_parameter.h" +namespace PS { + +template +class KVArray : public SharedParameter { + public: + KVArray(const string& my_name, const string& parent_name) : + 
SharedParameter(my_name, parent_name) { } + virtual ~KVArray() { } + + void setArray(int key, V* data, size_t size) { + val_[key] = SArray(data, size, false); + } + // SArray& array(int key) { return val_[key]; } + + // funcs will be called by the system + MessagePtrList slice(const MessagePtr& msg, const KeyRangeList& krs); + void getValue(const MessagePtr& msg); + void setValue(const MessagePtr& msg); + protected: + std::unordered_map> val_; + // an array is place into multiple servers only if its length > min_slice_size + size_t min_slice_size_ = 1000; + private: +}; + + +template +void KVArray::setValue(const MessagePtr& msg) { + CHECK_EQ(msg->value.size(), 1); + SArray recv_data(msg->value[0]); + Range kr(msg->task.key_range()); + CHECK_EQ(kr.size(), recv_data.size()); + auto& my_val = val_[msg->task.key_channel()]; + + if (IamWorker()) { + if (my_val.empty()) my_val.resize(kr.size(), 0); + CHECK_GE(my_val.size(), kr.end()); + my_val.segment(kr).copyFrom(recv_data); + } else if (IamServer()) { + // TODO this server can do flexible consistency control here + + if (my_val.empty()) { + // TODO user-defined intiailizer + my_val.resize(kr.size(), 0); + } + + // TODO user-defined updater + CHECK_GE(my_val.size(), kr.end()); + my_val.segment(kr).eigenArray() += recv_data.eigenArray(); + } +} + +// only be called at servers, namely a worker pull data from this server +template +void KVArray::getValue(const MessagePtr& msg) { + auto& my_val = val_[msg->task.key_channel()]; + Range kr(msg->task.key_range()); + CHECK_GE(my_val.size(), kr.end()); + SArray send_data(kr.size()); + send_data.copyFrom(my_val.segment(kr)); + msg->addValue(send_data); +} + +// divide a message into n part, where part i goes to server i. it's a zero-copy +// implementation +template +MessagePtrList KVArray::slice(const MessagePtr& msg, const KeyRangeList& krs) { + // divide the key range + size_t n = krs.size(); + MessagePtrList ret(n); + Range kr(msg->task.key_range()); + for (int i = 0; i < n; ++i) { + ret[i] = MessagePtr(new Message()); + ret[i]->miniCopyFrom(*msg); + ret[i]->valid = true; + auto mut_kr = ret[i]->task.mutable_key_range(); + if (kr.size() < min_slice_size_) { + if (i == 0) { + // server 0 get all data + kr.to(mut_kr); + } else { + Range(0,0).to(mut_kr); + // do not sent to server 1 - n + ret[i]->valid = false; + } + } else { + kr.evenDivide(n, i).to(mut_kr); + } + } + + // divide the data + for (int i = 0; i < msg->value.size(); ++i) { + SArray data(msg->value[i]); + CHECK_EQ(data.size(), kr.size()); + for (int j = 0; j < n; ++j) { + if (ret[j]->valid) { + Range kr(ret[i]->task.key_range()); + ret[i]->addValue(data.segment(kr)); + } + } + } + return ret; +} + + +} // namespace PS diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h index 6a32c6c26c22..6f13cdf82244 100644 --- a/mshadow-ps/ps_dist-inl.h +++ b/mshadow-ps/ps_dist-inl.h @@ -23,15 +23,15 @@ class DistServer : public LocalServer { Parent::SetParam(name, val); if (!strcmp(name, "name")) name_ = val; if (!strcmp(name, "parent_name")) parent_name_ = val; - } + } // initialize the parameter server virtual void Init(const std::vector &devices) { Parent::Init(devices); CHECK(!name_.empty()); CHECK(!parent_name_.empty()); - shared_model_ = new PS::KVArray(name_, parent_name_); + shared_model_ = new PS::KVArray(name_, parent_name_); } - virtual ~DistServer(void) { + virtual ~DistServer(void) { } // override this function, to use parameter server virtual void HandlePushFinish(Tensor data, @@ -40,47 +40,38 @@ class DistServer : public 
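The slice() above is what spreads one mshadow key over multiple servers: the key's value range is divided evenly, except that short arrays (under min_slice_size_, 1000 here) are sent whole to server 0. The partitioning rule as a standalone sketch, under the assumption that evenDivide produces equal-sized sub-ranges:

    #include <cstddef>
    #include <utility>

    // Returns the [begin, end) segment of a length-len value range that
    // server i out of n receives; mirrors the rule in slice() above.
    std::pair<size_t, size_t> ServerShard(size_t len, size_t n, size_t i,
                                          size_t min_slice = 1000) {
      if (len < min_slice) {
        if (i == 0) return std::make_pair(size_t(0), len);  // all to server 0
        return std::make_pair(len, len);                    // empty segment
      }
      const size_t chunk = len / n;  // assume n divides len in this sketch
      return std::make_pair(i * chunk, (i + 1) * chunk);
    }
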
LocalServer { for (index_t i = 1; i < data.size(0); ++i) { data[0] += data[i]; } - // things to send and recv + + // push Tensor sendrecv = data[0]; using namespace PS; - utils::Assert(data[0].CheckContiguous(), - "data must be contiguous"); + utils::Assert(data[0].CheckContiguous(), "data must be contiguous"); // TODO the zero copy version // SArray val(data.dptr_, data.MSize(), false); SArray val; val.copyFrom(sendrecv.dptr_, sendrecv.MSize()); - MessagePtr msg(new Message(kServerGroup)); - msg->addValue(val); - msg->task.set_key_channel(key); - Range(0, val.size()).to(msg->task.mutable_key_range()); - records_[key].push = CHECK_NOTNULL(shared_model_)->push(msg); - // setup callback - auto& rec = records_[key]; - MessagePtr msg(new Message(kServerGroup, -1, rec.push)); - msg->task.set_key_channel(key); - Range(0, sendrecv.MSize()).to(msg->task.mutable_key_range()); - - msg->fin_handle = [this, sendrecv, key]() { - const auto& recv = shared_model_->array(key); - CHECK_EQ(sendrecv.MSize(), recv.size()); - memcpy(CHECK_NOTNULL(sendrecv.dptr_), recv.data(), recv.size() * sizeof(DType)); + MessagePtr push_msg(new Message(kServerGroup)); + push_msg->addValue(val); + push_msg->task.set_key_channel(key); + Range(0, val.size()).to(push_msg->task.mutable_key_range()); + int push_time = CHECK_NOTNULL(shared_model_)->push(push_msg); + + // pull + MessagePtr pull_msg(new Message(kServerGroup, -1, push_time)); + pull_msg->task.set_key_channel(key); + Range(0, sendrecv.MSize()).to(pull_msg->task.mutable_key_range()); + shared_model_->setArray(key, sendrecv.dptr_, sendrecv.MSize()); + pull_msg->fin_handle = [this, sendrecv, key]() { // call PullReady to notify LocalServer pulling is ready - this->PullReady(sendrecv, key); + this->PullReady(sendrecv, key); }; - rec.pull = CHECK_NOTNULL(shared_model_)->pull(msg); + shared_model_->pull(pull_msg); } private: - struct Record { - int push = -1; - int pull = -1; - DType* data = nullptr; - }; std::string name_; std::string parent_name_; - PS::KVArray* shared_model_; + PS::KVArray* shared_model_ = nullptr; }; #endif } // namespace ps } // namespace msahdow #endif - From 28ce2b0b1126f0256d52fe454e55f2fbf0f7a96d Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 17 Jan 2015 16:36:50 -0800 Subject: [PATCH 110/147] add pinned memory to host, add omp --- mshadow-ps/Makefile | 4 +-- mshadow-ps/ps_local-inl.h | 59 ++++++++++++++++++++++++++++++++++----- mshadow/tensor_cpu-inl.h | 44 +++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 9 deletions(-) diff --git a/mshadow-ps/Makefile b/mshadow-ps/Makefile index bec318a84e8c..6b3d787e99d4 100644 --- a/mshadow-ps/Makefile +++ b/mshadow-ps/Makefile @@ -1,8 +1,8 @@ # set LD_LIBRARY_PATH export CC = gcc -export CXX = clang++ +export CXX = g++ export NVCC =nvcc -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -DMSHADOW_STAND_ALONE=1 -std=c++11 +export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -DMSHADOW_STAND_ALONE=1 -std=c++11 -fopenmp export LDFLAGS= -lm -lpthread export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index b315c9de8b77..438bfb8fa1f5 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -9,6 +9,15 @@ #define MSHADOW_PS_LOCAL_INL_H_ #include #include +#if defined(_OPENMP) +#include +#ifdef _MSC_VER +typedef int ms_omp_uint; +#else +typedef unsigned ms_omp_uint; +#endif +#endif + #include "./thread.h" #include "./thread_util.h" #include "./ps.h" @@ -27,6 +36,8 @@ 
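
The HandlePushFinish rewrite above issues a push of the reduced gradient and then a pull that appears to depend on it (the push timestamp is handed to the pull message), with fin_handle copying the pulled weights back before PullReady wakes the workers. A self-contained sketch of that dependency-plus-completion-callback pattern; everything here (ToyServer, the fixed learning rate, the synchronous behavior) is illustrative, not the PS messaging API:

    #include <cstdio>
    #include <functional>
    #include <vector>

    // Toy stand-in for the server side: push returns a timestamp, pull is
    // told which timestamp to wait for and fires a completion hook.
    struct ToyServer {
      std::vector<float> weight;
      int clock = 0;
      int Push(const std::vector<float> &grad) {  // apply a trivial update
        if (weight.empty()) weight.assign(grad.size(), 0);
        for (size_t i = 0; i < grad.size(); ++i) weight[i] -= 0.1f * grad[i];
        return ++clock;                           // the push timestamp
      }
      void Pull(int wait_for, std::vector<float> *out,
                std::function<void()> fin) {
        // a real implementation would block until `wait_for` completes;
        // this toy version is synchronous, so the dependency always holds
        (void)wait_for;
        *out = weight;
        fin();  // corresponds to msg->fin_handle
      }
    };

    int main() {
      ToyServer s;
      std::vector<float> grad = {1, 2, 3}, w;
      int ts = s.Push(grad);
      s.Pull(ts, &w, [&]() { std::printf("pull done, w[0]=%f\n", w[0]); });
    }
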
class LocalServer : public IParamServer { init_end = 0; perdev_pull_thread = 1; perdev_push_thread = 1; + bigarray_bound = 1000 * 1000; + nthread_reduction = 8; } // destructor virtual ~LocalServer(void) { @@ -72,6 +83,12 @@ class LocalServer : public IParamServer { } utils::Error("unknown push operation %s", val); } + if (!strcmp(name, "reduce_thread")) { + nthread_reduction = atoi(val); + } + if (!strcmp(name, "bigarray_bound")) { + bigarray_bound = static_cast(atol(val)); + } if (!strcmp(name, "pull_thread")) { if (!strcmp(val, "ndev")) { perdev_pull_thread = 1; @@ -282,9 +299,7 @@ class LocalServer : public IParamServer { } switch (op) { case kSum: { - for (index_t i = 1; i < data.size(0); ++i) { - data[0] += data[i]; - } + this->ReduceSum(data); this->PullReady(data[0], key); return; } @@ -315,7 +330,7 @@ class LocalServer : public IParamServer { /*! \brief data structure to hold temporal push result */ struct PushEntry { // temporal space to hold input data - TensorContainer data; + Tensor data; // indicator whether the certain devices is already copied in std::vector copied; // number of data copied in @@ -324,13 +339,19 @@ class LocalServer : public IParamServer { int copyin_version; // constructor PushEntry(void) - : data(false), copyin_version(0) {} + : copyin_version(0) {} + ~PushEntry(void) { + if (data.dptr_ != NULL) { + mshadow::FreeHost(&data); + } + } // constructor inline void Init(int ndevice, Shape<2> shape) { - data.Resize(Shape4(2, ndevice, shape[0], shape[1])); + data.shape_ = Shape4(2, ndevice, shape[0], shape[1]); + mshadow::AllocHost(&data); num_copied = 0; copied.resize(ndevice, false); - } + } }; // a record to remember things related to pull request struct PullReqRecord { @@ -407,10 +428,34 @@ class LocalServer : public IParamServer { utils::ConditionVariable wait_cond; //---------configurations of server------- int init_end; + // number of reduction thread + int nthread_reduction; + // the threshold for big array + size_t bigarray_bound; // whether use pull thread per device int perdev_pull_thread; // whether use push thread per device int perdev_push_thread; + // perform sum reduction + inline void ReduceSum(Tensor data) { + #if defined(_OPENMP) + if (data[0].MSize() >= bigarray_bound && + nthread_reduction != 0) { + ms_omp_uint ntask = static_cast(data.size(1)); + #pragma omp parallel for schedule(static) num_threads(nthread_reduction) + for (ms_omp_uint j = 0; j < ntask; ++j) { + for (index_t i = 1; i < data.size(0); ++i) { + data[0][j] += data[i][j]; + } + } + } else + #endif + { + for (index_t i = 1; i < data.size(0); ++i) { + data[0] += data[i]; + } + } + } // push handler inline void PushProc(utils::ThreadPQueue *queue) { while (!destroy_signal) { diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h index ab5fb4f0a299..c3ddbe107d3f 100644 --- a/mshadow/tensor_cpu-inl.h +++ b/mshadow/tensor_cpu-inl.h @@ -24,6 +24,50 @@ inline void DeleteStream(Stream *stream) { delete stream; } +template +inline void *AllocHost_(size_t size); +template +inline void FreeHost_(void * dptr); + +#ifdef __CUDACC__ +template<> +inline void *AllocHost_(size_t size) { + void *dptr; + utils::Check(cudaMallocHost(&dptr, size, + cudaHostAllocPortable) == cudaSuccess, + "AllocHost"); + return dptr; +} +template<> +inline void FreeHost_(void *dptr) { + cudaFreeHost(dptr); +} +#endif + +template<> +inline void *AllocHost_(size_t size) { + size_t pitch; + return sse2::AlignedMallocPitch(&pitch, size, 1); +} +template<> +inline void FreeHost_(void *dptr) { + 
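
The ReduceSum introduced above parallelizes over columns (the j loop) rather than over the device rows, so each OpenMP thread owns a disjoint slice of data[0] and no two threads ever write the same element; the bigarray_bound threshold keeps small arrays on a single thread, where fork/join overhead would dominate. The same pattern in a standalone form (the 4x8 layout and threshold value are made up for the example):

    #include <cstdio>
    #include <vector>
    #if defined(_OPENMP)
    #include <omp.h>
    #endif

    // Sum all rows into row 0, parallelizing across columns so writes
    // never collide between threads.
    void ReduceSumRows(std::vector<std::vector<float> > &rows,
                       size_t threshold) {
      const size_t len = rows[0].size();
    #if defined(_OPENMP)
      if (len >= threshold) {               // mirrors the bigarray_bound test
        #pragma omp parallel for schedule(static)
        for (long j = 0; j < static_cast<long>(len); ++j)
          for (size_t i = 1; i < rows.size(); ++i) rows[0][j] += rows[i][j];
        return;
      }
    #endif
      for (size_t i = 1; i < rows.size(); ++i)  // small case: single thread
        for (size_t j = 0; j < len; ++j) rows[0][j] += rows[i][j];
    }

    int main() {
      std::vector<std::vector<float> > rows(4, std::vector<float>(8, 1.0f));
      ReduceSumRows(rows, 4);
      std::printf("%f\n", rows[0][0]);  // prints 4.0: one element per row
    }
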
sse2::AlignedFree(dptr); +} + +template +inline void AllocHost(Tensor *obj) { + obj->stride_ = obj->size(dim - 1); + utils::Assert(obj->CheckContiguous(), "AllocHost"); + void *dptr = AllocHost_(obj->MSize() * sizeof(DType)); + obj->dptr_ = reinterpret_cast(dptr); +} +template +inline void FreeHost(Tensor *obj) { + utils::Assert(obj->dptr_ != NULL, "FreeHost:: double free"); + FreeHost_(obj->dptr_); + obj->dptr_ = NULL; +} + template inline void AllocSpace(Tensor *obj, bool pad) { size_t pitch; From f1852835ca647b360b4e0d0567d6f5581e46846f Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 17 Jan 2015 16:39:52 -0800 Subject: [PATCH 111/147] quick merge --- mshadow-ps/Makefile | 4 ---- mshadow-ps/ps_dist-inl.h | 3 ++- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/mshadow-ps/Makefile b/mshadow-ps/Makefile index f3fec78ad5c3..108ff756ff51 100644 --- a/mshadow-ps/Makefile +++ b/mshadow-ps/Makefile @@ -2,11 +2,7 @@ export CC = gcc export CXX = g++ export NVCC =nvcc -<<<<<<< HEAD export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -DMSHADOW_STAND_ALONE=1 -std=c++11 -fopenmp -======= -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ ->>>>>>> e68e816ea997f30aaac8ea6a8fa5450acbea9848 export LDFLAGS= -lm -lpthread export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h index 6f13cdf82244..d7b4b67ec489 100644 --- a/mshadow-ps/ps_dist-inl.h +++ b/mshadow-ps/ps_dist-inl.h @@ -9,8 +9,9 @@ #define MSHADOW_PS_DIST_INL_H_ #include "./ps.h" #include "./ps_local-inl.h" +#if MSHADOW_DIST_PS_ #include "./kv_array.h" - +#endif namespace mshadow { namespace ps { #if MSHADOW_DIST_PS_ From b7ff1fe9ff05ef78fa94a2fcf17d200f618d6732 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 17 Jan 2015 22:24:09 -0800 Subject: [PATCH 112/147] add option to disable pin ram --- mshadow-ps/ps_local-inl.h | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 86598ae880aa..2483176a9ef4 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -38,6 +38,7 @@ class LocalServer : public IParamServer { perdev_push_thread = 1; bigarray_bound = 1000 * 1000; nthread_reduction = 8; + use_pin_memory = 1; destroy_signal = false; } // destructor @@ -87,6 +88,9 @@ class LocalServer : public IParamServer { if (!strcmp(name, "reduce_thread")) { nthread_reduction = atoi(val); } + if (!strcmp(name, "use_pin_memory")) { + use_pin_memory = atoi(val); + } if (!strcmp(name, "bigarray_bound")) { bigarray_bound = static_cast(atol(val)); } @@ -339,18 +343,30 @@ class LocalServer : public IParamServer { int num_copied; // version number of data used to hold incomming data in push int copyin_version; + // use pinned memory + bool pin_memory; // constructor PushEntry(void) : copyin_version(0) {} ~PushEntry(void) { if (data.dptr_ != NULL) { - mshadow::FreeHost(&data); + if (pin_memory) { + mshadow::FreeHost(&data); + } else { + mshadow::FreeSpace(&data); + } } } // constructor - inline void Init(int ndevice, Shape<2> shape) { + inline void Init(int ndevice, Shape<2> shape, bool pin_memory) { + this->pin_memory = pin_memory; data.shape_ = Shape4(2, ndevice, shape[0], shape[1]); - mshadow::AllocHost(&data); + if (pin_memory) { + mshadow::AllocHost(&data); + } else { + mshadow::AllocSpace(&data, false); + } + utils::Assert(data.CheckContiguous(), "Init"); num_copied = 0; copied.resize(ndevice, false); } @@ -430,6 +446,8 @@ 
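
The AllocHost_/FreeHost_ pair above chooses pinned (page-locked) host memory when compiled under nvcc and falls back to an SSE-aligned malloc otherwise; pinned pages let the driver DMA host-device copies in the push/pull path instead of staging them. A hedged sketch of the same two-path idea using the public cudaHostAlloc call, with POSIX posix_memalign standing in for the mshadow sse2 helpers:

    #include <cstdlib>
    #ifdef __CUDACC__
    #include <cuda_runtime.h>
    #endif

    // Allocate `size` bytes of host memory: pinned under nvcc so copies
    // can use DMA, ordinary 16-byte-aligned memory otherwise.
    inline void *AllocHostBytes(size_t size) {
    #ifdef __CUDACC__
      void *p = NULL;
      if (cudaHostAlloc(&p, size, cudaHostAllocPortable) != cudaSuccess)
        return NULL;
      return p;
    #else
      void *p = NULL;  // 16 bytes mirrors what an SSE-aware allocator gives
      if (posix_memalign(&p, 16, size) != 0) return NULL;
      return p;
    #endif
    }

    inline void FreeHostBytes(void *p) {
    #ifdef __CUDACC__
      cudaFreeHost(p);   // must pair with the pinned allocation
    #else
      free(p);
    #endif
    }

The allocation and free paths must match, which is why the later PushEntry code records whether a buffer was pinned before deciding how to release it.
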
class LocalServer : public IParamServer { utils::ConditionVariable wait_cond; //---------configurations of server------- int init_end; + // use pinned memory + int use_pin_memory; // number of reduction thread int nthread_reduction; // the threshold for big array @@ -647,7 +665,7 @@ class LocalServer : public IParamServer { if (e.copied.size() == 0) { push_lock.Lock(); if (e.copied.size() == 0) { - e.Init(devices.size(), shape); + e.Init(devices.size(), shape, use_pin_memory != 0); } push_lock.Unlock(); } From 4cc2a9c164cfb7497d13114005c42913c5302a62 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 21 Jan 2015 20:59:33 -0800 Subject: [PATCH 113/147] add update on server --- mshadow-ps/ps.h | 60 +++++++++++++++++++++++++++++++ mshadow-ps/ps_dist-inl.h | 6 ++++ mshadow-ps/ps_local-inl.h | 76 ++++++++++++++++++++++++++++++++------- 3 files changed, 129 insertions(+), 13 deletions(-) diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h index e9b3311be4e8..34da246825d7 100644 --- a/mshadow-ps/ps.h +++ b/mshadow-ps/ps.h @@ -63,6 +63,18 @@ class IParamServer { dev.push_back(0); this->Init(dev); } + /*! + * \brief initialize a key with certain shape + * \param shape the shape content of the key + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + */ + template + inline void InitKey(Shape shape, + int key, int devid) { + this->InitKey_(shape.FlatTo2D(), key, devid); + } /*! * \brief wait until the pull event finishes * if there was no pull request, wait will directly returns @@ -128,6 +140,15 @@ class IParamServer { } #endif // C++11 protected: + /*! + * \brief initialize a key with certain shape + * \param shape the shape content of the key + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + */ + virtual void InitKey_(Shape<2> shape, + int key, int devid) = 0; /*! * \brief push out a tensor to parameter server * this call is asynchronize and returns immediately @@ -164,6 +185,7 @@ class IParamServer { int priority, CallbackFunction callback, void *callback_arg) = 0; + private: // C++11 support for lambda prepare function #if __cplusplus >= 201103L @@ -175,6 +197,44 @@ class IParamServer { } #endif // C++11 }; +/*! \brief interface for customized mshadow server */ +template +class ICustomServer { + public: + virtual ~ICustomServer(void) {} + /*! + * \brief set parameters from outside + * \param name name of parameter + * \param val value of parameter + */ + virtual void SetParam(const char *name, const char *val) = 0; + /*! + * \brief init the server + * \param rank the rank of the node + * \param conf configuration + */ + virtual void Init(int rank, const std::string &conf) = 0; + /*! + * \brief initialize the key + * \param key the key of data we point to + * \param dptr the data pointer + * \param size size of the parameter key + */ + virtual void InitKey(int key, DType *dptr, size_t size) = 0; + /*! + * \param key the key of data we point to + * \param dptr the data pointer + * \param size size of the parameter key + */ + virtual void Update(int key, DType *dptr, size_t size) = 0; +}; +/*! 
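
At this point the local server understands a small set of string-valued knobs: reduce_thread, bigarray_bound, use_pin_memory, plus the earlier pull_thread/push_thread and push operation settings. A usage sketch, assuming the Create factory and include paths as they appear in these patches (the numeric values are just examples):

    #include <vector>
    #include "mshadow/tensor.h"
    #include "mshadow-ps/ps.h"

    int main(void) {
      using namespace mshadow;
      ps::IParamServer<cpu, float> *server = ps::Create<cpu, float>("local");
      server->SetParam("reduce_thread", "4");        // OpenMP threads in ReduceSum
      server->SetParam("bigarray_bound", "1000000"); // parallel sum threshold
      server->SetParam("use_pin_memory", "0");       // opt out of pinned buffers
      std::vector<int> devs(1, 0);
      server->Init(devs);
      // ... push/pull traffic would go here ...
      delete server;
      return 0;
    }
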
+ * \brief create customized server + * this is a server defined by user + * \return new server + */ +template +ICustomServer *CreateServer(void); } // namespace ps } // namespace mshadow diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h index d7b4b67ec489..72e24df2b57b 100644 --- a/mshadow-ps/ps_dist-inl.h +++ b/mshadow-ps/ps_dist-inl.h @@ -31,9 +31,15 @@ class DistServer : public LocalServer { CHECK(!name_.empty()); CHECK(!parent_name_.empty()); shared_model_ = new PS::KVArray(name_, parent_name_); + if (this->custom_server != NULL) { + delete this->custom_server; + this->custom_server = NULL; + } } virtual ~DistServer(void) { } + // remove custom + virtual void ServerInitKey(Tensor weight, int key) {} // override this function, to use parameter server virtual void HandlePushFinish(Tensor data, int key) { diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 2483176a9ef4..71ab395ff631 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -40,6 +40,7 @@ class LocalServer : public IParamServer { nthread_reduction = 8; use_pin_memory = 1; destroy_signal = false; + custom_server = NULL; } // destructor virtual ~LocalServer(void) { @@ -70,6 +71,7 @@ class LocalServer : public IParamServer { wait_lock.Destroy(); wait_cond.Destroy(); } + if (custom_server != NULL) delete custom_server; } virtual void SetParam(const char *name, const char *val) { int key; @@ -114,6 +116,9 @@ class LocalServer : public IParamServer { " can only be ndev or one"); } } + if (!strcmp(name, "update_on_server")) { + update_on_server = 1; + } } virtual void PullWait(int key, int devid) { const int wid = GetWorkIndex(devid); @@ -203,8 +208,11 @@ class LocalServer : public IParamServer { } else { thread_pull_handler.resize(1); thread_pull_handler[0].Start(PullGlobalThread, this); - } - this->init_end = 0; + } + if (update_on_server != 0) { + custom_server = CreateServer(); + } + this->init_end = 1; } protected: @@ -218,10 +226,16 @@ class LocalServer : public IParamServer { */ kGather = 1 }; + virtual void InitKey_(Shape<2> shape, + int key, int devid) { + this->InitPullMap(key); + this->InitPushMap(key, shape); + } + virtual void Push_(Tensor data, int key, int devid, int priority) { - this->InitPullMap(key, devid); - this->InitPushMap(key, data.shape_); + PullEntry &e = pull_map.GetRef(key); + e.req[GetWorkIndex(devid)].ready = false; if (perdev_push_thread != 0) { int wid = GetWorkIndex(devid); push_queues[wid].Push(PullTask(data, key, devid), priority); @@ -288,6 +302,13 @@ class LocalServer : public IParamServer { } request_lock.Unlock(); } + virtual void ServerInitKey(Tensor weight, int key) { + if (custom_server != NULL) { + // intialize server, and ready for pullback + custom_server->InitKey(key, weight.dptr_, weight.MSize()); + this->PullReady(weight, key); + } + } /*! 
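
ICustomServer is the hook an application implements so that weight updates run inside the server rather than in worker code, and CreateServer is deliberately left undefined so the application supplies the factory. A minimal sketch of what such an implementation could look like; plain SGD and the fixed default learning rate are this sketch's assumptions, not something mshadow-ps prescribes:

    #include <cstdlib>
    #include <cstring>
    #include <map>
    #include <string>
    #include "mshadow-ps/ps.h"

    // Example application-side updater: SGD with a configurable rate.
    template<typename DType>
    class SGDUpdater : public mshadow::ps::ICustomServer<DType> {
     public:
      virtual void SetParam(const char *name, const char *val) {
        if (!strcmp(name, "eta")) eta_ = static_cast<DType>(atof(val));
      }
      virtual void Init(int rank, const std::string &conf) {}
      virtual void InitKey(int key, DType *dptr, size_t size) {
        // dptr is the server-side weight buffer for this key; keep a handle
        // and start from zero (a real model would randomize or load here)
        memset(dptr, 0, size * sizeof(DType));
        weight_[key] = dptr;
      }
      virtual void Update(int key, DType *grad, size_t size) {
        DType *w = weight_[key];  // simple SGD step: w -= eta * grad
        for (size_t i = 0; i < size; ++i) w[i] -= eta_ * grad[i];
      }
     private:
      DType eta_ = static_cast<DType>(0.01);
      std::map<int, DType*> weight_;
    };

    namespace mshadow {
    namespace ps {
    // one way to provide the factory the header leaves undefined
    template<>
    ICustomServer<float> *CreateServer<float>(void) {
      return new SGDUpdater<float>();
    }
    }  // namespace ps
    }  // namespace mshadow
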
* \brief event handler for push finish * called when all the data with same key comes int @@ -303,6 +324,14 @@ class LocalServer : public IParamServer { if (it != push_operation.end() && it->first == key) { op = it->second; } + // customized server + if (custom_server != NULL) { + this->ReduceSum(data); + custom_server->Update(key, data[0].dptr_, data[0].MSize()); + PushEntry &e = push_map.GetRef(key); + this->PullReady(e.weight, key); + return; + } switch (op) { case kSum: { this->ReduceSum(data); @@ -337,6 +366,8 @@ class LocalServer : public IParamServer { struct PushEntry { // temporal space to hold input data Tensor data; + // temporal space to hold weight, if needed + Tensor weight; // indicator whether the certain devices is already copied in std::vector copied; // number of data copied in @@ -347,26 +378,39 @@ class LocalServer : public IParamServer { bool pin_memory; // constructor PushEntry(void) - : copyin_version(0) {} + : copyin_version(0) { + weight.dptr_ = NULL; + } ~PushEntry(void) { if (data.dptr_ != NULL) { if (pin_memory) { mshadow::FreeHost(&data); + if (weight.dptr_ != NULL) { + mshadow::FreeHost(&weight); + } } else { mshadow::FreeSpace(&data); + if (weight.dptr_ != NULL) { + mshadow::FreeSpace(&weight); + } } } } // constructor - inline void Init(int ndevice, Shape<2> shape, bool pin_memory) { + inline void Init(int ndevice, Shape<2> shape, + bool pin_memory, bool need_weight) { this->pin_memory = pin_memory; data.shape_ = Shape4(2, ndevice, shape[0], shape[1]); + weight.shape_ = shape; if (pin_memory) { mshadow::AllocHost(&data); + if (need_weight) mshadow::AllocHost(&weight); } else { mshadow::AllocSpace(&data, false); + if (need_weight) mshadow::AllocSpace(&weight); } utils::Assert(data.CheckContiguous(), "Init"); + utils::Assert(!need_weight || weight.CheckContiguous(), "Init"); num_copied = 0; copied.resize(ndevice, false); } @@ -395,7 +439,8 @@ class LocalServer : public IParamServer { int nwait; // the request was finished bool finished; - PullWaitRecord(void) : nwait(0), finished(true) { + PullWaitRecord(void) + : nwait(0), finished(true) { // set finished to true so pull without pull request returns } }; @@ -443,9 +488,13 @@ class LocalServer : public IParamServer { // lock to lock wait field utils::Mutex wait_lock; // conditional variable to do waiting - utils::ConditionVariable wait_cond; - //---------configurations of server------- + utils::ConditionVariable wait_cond; + // customized server + ICustomServer *custom_server; + //---------configurations of server------- int init_end; + // whether perform update on serverside + int update_on_server; // use pinned memory int use_pin_memory; // number of reduction thread @@ -638,7 +687,7 @@ class LocalServer : public IParamServer { return dev2index[devid]; } // functions to handle pull - inline void InitPullMap(int key, int devid) { + inline void InitPullMap(int key) { pull_map.Init(key); PullEntry &e = pull_map.GetRef(key); request_lock.Lock(); @@ -647,13 +696,12 @@ class LocalServer : public IParamServer { e.req.resize(devices.size(), PullReqRecord()); } request_lock.Unlock(); - e.req[GetWorkIndex(devid)].ready = false; // check wait map if (e.wait.size() == 0) { wait_lock.Lock(); // must recheck after lock if (e.wait.size() == 0) { - e.wait.resize(devices.size(), PullWaitRecord()); + e.wait.resize(devices.size(), PullWaitRecord()); } wait_lock.Unlock(); } @@ -665,8 +713,10 @@ class LocalServer : public IParamServer { if (e.copied.size() == 0) { push_lock.Lock(); if (e.copied.size() == 0) { - 
e.Init(devices.size(), shape, use_pin_memory != 0); + e.Init(devices.size(), shape, + use_pin_memory != 0, update_on_server); } + this->ServerInitKey(e.weight, key); push_lock.Unlock(); } } From 0542af8fbf9f95c8db766568264a302df26707cd Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 22 Jan 2015 10:26:43 -0800 Subject: [PATCH 114/147] fix --- mshadow-ps/ps_local-inl.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 71ab395ff631..1c6624ec4f37 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -115,10 +115,12 @@ class LocalServer : public IParamServer { utils::Error("invalid value for parameter push_thread,"\ " can only be ndev or one"); } - } + } if (!strcmp(name, "update_on_server")) { - update_on_server = 1; + update_on_server = atoi(val); } + cfgvec.push_back(std::make_pair(std::string(name), + std::string(val))); } virtual void PullWait(int key, int devid) { const int wid = GetWorkIndex(devid); @@ -209,8 +211,13 @@ class LocalServer : public IParamServer { thread_pull_handler.resize(1); thread_pull_handler[0].Start(PullGlobalThread, this); } - if (update_on_server != 0) { + if (update_on_server != 0) { custom_server = CreateServer(); + for (size_t j = 0; j < cfgvec.size(); ++j) { + custom_server->SetParam(cfgvec[j].first.c_str(), + cfgvec[j].second.c_str()); + } + custom_server->Init(0, std::string()); } this->init_end = 1; } @@ -229,7 +236,7 @@ class LocalServer : public IParamServer { virtual void InitKey_(Shape<2> shape, int key, int devid) { this->InitPullMap(key); - this->InitPushMap(key, shape); + this->InitPushMap(key, shape); } virtual void Push_(Tensor data, @@ -505,6 +512,8 @@ class LocalServer : public IParamServer { int perdev_pull_thread; // whether use push thread per device int perdev_push_thread; + /*! 
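
The cfgvec change above fixes an ordering problem: SetParam calls can arrive before Init has created custom_server, so every (name, value) pair is recorded and replayed once the server object exists. The pattern in isolation — Host and LateObject are illustrative names, not mshadow-ps types:

    #include <cstdio>
    #include <string>
    #include <utility>
    #include <vector>

    // An object that only exists after Init; parameters set before that
    // point would otherwise be lost.
    struct LateObject {
      void SetParam(const std::string &k, const std::string &v) {
        std::printf("applied %s=%s\n", k.c_str(), v.c_str());
      }
    };

    struct Host {
      std::vector<std::pair<std::string, std::string> > cfgvec;
      LateObject *obj = nullptr;
      void SetParam(const std::string &k, const std::string &v) {
        cfgvec.push_back(std::make_pair(k, v));  // record; obj may not exist
      }
      void Init() {
        obj = new LateObject();
        for (size_t j = 0; j < cfgvec.size(); ++j)  // replay history in order
          obj->SetParam(cfgvec[j].first, cfgvec[j].second);
      }
    };

    int main() {
      Host h;
      h.SetParam("eta", "0.01");  // arrives before Init
      h.Init();                   // replayed here
      delete h.obj;
    }
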
\brief history of configurations */ + std::vector< std::pair > cfgvec; // perform sum reduction inline void ReduceSum(Tensor data) { #if defined(_OPENMP) From b75d01280f15e9d2b7628a2e23c16a3caff28b87 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 22 Jan 2015 12:09:57 -0800 Subject: [PATCH 115/147] fix mshadow ps on server --- mshadow-ps/ps_local-inl.h | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 1c6624ec4f37..9bcb942b4eab 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -235,8 +235,10 @@ class LocalServer : public IParamServer { }; virtual void InitKey_(Shape<2> shape, int key, int devid) { - this->InitPullMap(key); - this->InitPushMap(key, shape); + if (devid == devices[0]) { + this->InitPullMap(key); + this->InitPushMap(key, shape); + } } virtual void Push_(Tensor data, @@ -637,7 +639,7 @@ class LocalServer : public IParamServer { "PullHandler, must initialize the key, req"); PullWaitRecord &w = e.wait[wid]; wait_lock.Lock(); - w.finished = true; + w.finished = true; if (w.nwait != 0) { wait_cond.Broadcast(); } @@ -706,28 +708,24 @@ class LocalServer : public IParamServer { } request_lock.Unlock(); // check wait map + wait_lock.Lock(); + // must recheck after lock if (e.wait.size() == 0) { - wait_lock.Lock(); - // must recheck after lock - if (e.wait.size() == 0) { - e.wait.resize(devices.size(), PullWaitRecord()); - } - wait_lock.Unlock(); + e.wait.resize(devices.size(), PullWaitRecord()); } + wait_lock.Unlock(); } // functions to handle pull inline void InitPushMap(int key, Shape<2> shape) { push_map.Init(key); PushEntry &e = push_map.GetRef(key); + push_lock.Lock(); if (e.copied.size() == 0) { - push_lock.Lock(); - if (e.copied.size() == 0) { - e.Init(devices.size(), shape, - use_pin_memory != 0, update_on_server); - } - this->ServerInitKey(e.weight, key); - push_lock.Unlock(); + e.Init(devices.size(), shape, + use_pin_memory != 0, update_on_server != 0); } + this->ServerInitKey(e.weight, key); + push_lock.Unlock(); } }; } // namespace ps From 95ba26a35a02f3f4bc42d2fd66767132abd15ae9 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 22 Jan 2015 12:35:51 -0800 Subject: [PATCH 116/147] add server --- mshadow-ps/ps_dist-inl.h | 35 +++++++++++++++++++++++++++++++---- mshadow-ps/ps_local-inl.h | 30 ++++++++++++++---------------- 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h index 72e24df2b57b..0bcf55da810c 100644 --- a/mshadow-ps/ps_dist-inl.h +++ b/mshadow-ps/ps_dist-inl.h @@ -11,10 +11,8 @@ #include "./ps_local-inl.h" #if MSHADOW_DIST_PS_ #include "./kv_array.h" -#endif namespace mshadow { namespace ps { -#if MSHADOW_DIST_PS_ template class DistServer : public LocalServer { public: @@ -38,7 +36,7 @@ class DistServer : public LocalServer { } virtual ~DistServer(void) { } - // remove custom + // remove custom, leave it empty virtual void ServerInitKey(Tensor weight, int key) {} // override this function, to use parameter server virtual void HandlePushFinish(Tensor data, @@ -78,7 +76,36 @@ class DistServer : public LocalServer { std::string parent_name_; PS::KVArray* shared_model_ = nullptr; }; -#endif + +template +class MShadowServer : public PS::App { + public: + MShadowerver(const std::string &conf) : App() { + server = CreateServer(); + server.Init(myRank(), conf); + } + virtual ~HelloServer() { + delete server; + } + void init() { + + } + void init_key(int key, DType *dptr, 
size_t size) { + server->InitKey(key, dptr, size); + auto callback = [server](DType *data, size_t sz) { + server->Update(key, data, sz); + }; + // register callback of update function + } + private: + // internal server + ICustomServer *server; +}; +// +// NOTE: do not add PS::CreateServer here +// add it in the program that uses mshadow-ps } // namespace ps } // namespace msahdow #endif +} // namespace PS +#endif diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 1c6624ec4f37..9bcb942b4eab 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -235,8 +235,10 @@ class LocalServer : public IParamServer { }; virtual void InitKey_(Shape<2> shape, int key, int devid) { - this->InitPullMap(key); - this->InitPushMap(key, shape); + if (devid == devices[0]) { + this->InitPullMap(key); + this->InitPushMap(key, shape); + } } virtual void Push_(Tensor data, @@ -637,7 +639,7 @@ class LocalServer : public IParamServer { "PullHandler, must initialize the key, req"); PullWaitRecord &w = e.wait[wid]; wait_lock.Lock(); - w.finished = true; + w.finished = true; if (w.nwait != 0) { wait_cond.Broadcast(); } @@ -706,28 +708,24 @@ class LocalServer : public IParamServer { } request_lock.Unlock(); // check wait map + wait_lock.Lock(); + // must recheck after lock if (e.wait.size() == 0) { - wait_lock.Lock(); - // must recheck after lock - if (e.wait.size() == 0) { - e.wait.resize(devices.size(), PullWaitRecord()); - } - wait_lock.Unlock(); + e.wait.resize(devices.size(), PullWaitRecord()); } + wait_lock.Unlock(); } // functions to handle pull inline void InitPushMap(int key, Shape<2> shape) { push_map.Init(key); PushEntry &e = push_map.GetRef(key); + push_lock.Lock(); if (e.copied.size() == 0) { - push_lock.Lock(); - if (e.copied.size() == 0) { - e.Init(devices.size(), shape, - use_pin_memory != 0, update_on_server); - } - this->ServerInitKey(e.weight, key); - push_lock.Unlock(); + e.Init(devices.size(), shape, + use_pin_memory != 0, update_on_server != 0); } + this->ServerInitKey(e.weight, key); + push_lock.Unlock(); } }; } // namespace ps From 1f6f298dcd37aa1aae3ed2c641794eff70df5e38 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 22 Jan 2015 12:40:41 -0800 Subject: [PATCH 117/147] Update ps_dist-inl.h --- mshadow-ps/ps_dist-inl.h | 1 - 1 file changed, 1 deletion(-) diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h index 0bcf55da810c..0f07abeeae8f 100644 --- a/mshadow-ps/ps_dist-inl.h +++ b/mshadow-ps/ps_dist-inl.h @@ -107,5 +107,4 @@ class MShadowServer : public PS::App { } // namespace ps } // namespace msahdow #endif -} // namespace PS #endif From 29bb49c91d7e9e7463aa04b2a10b4902a583a1e5 Mon Sep 17 00:00:00 2001 From: muli Date: Thu, 22 Jan 2015 20:58:14 -0500 Subject: [PATCH 118/147] add updater in ps_dist --- mshadow-ps/kv_array.h | 34 ++++++++++++++++-------- mshadow-ps/ps_dist-inl.h | 54 +++++++++++++++++---------------------- mshadow-ps/ps_local-inl.h | 45 ++++++++++++++++---------------- 3 files changed, 70 insertions(+), 63 deletions(-) diff --git a/mshadow-ps/kv_array.h b/mshadow-ps/kv_array.h index 4eac0ca93e04..8f4237b813e2 100644 --- a/mshadow-ps/kv_array.h +++ b/mshadow-ps/kv_array.h @@ -1,17 +1,25 @@ #pragma once #include "parameter/shared_parameter.h" +#include "ps.h" namespace PS { +DECLARE_string(app_name); + template class KVArray : public SharedParameter { public: - KVArray(const string& my_name, const string& parent_name) : + KVArray(const string& my_name = FLAGS_app_name, + const string& parent_name = FLAGS_app_name + 
"_model") : SharedParameter(my_name, parent_name) { } virtual ~KVArray() { } void setArray(int key, V* data, size_t size) { val_[key] = SArray(data, size, false); } + void setUpdater(ICustomServer* updater) { + updater_ = updater; + } + // SArray& array(int key) { return val_[key]; } // funcs will be called by the system @@ -22,6 +30,8 @@ class KVArray : public SharedParameter { std::unordered_map> val_; // an array is place into multiple servers only if its length > min_slice_size size_t min_slice_size_ = 1000; + + ICustomServer* updater_ = nullptr; private: }; @@ -32,23 +42,25 @@ void KVArray::setValue(const MessagePtr& msg) { SArray recv_data(msg->value[0]); Range kr(msg->task.key_range()); CHECK_EQ(kr.size(), recv_data.size()); - auto& my_val = val_[msg->task.key_channel()]; + int key = msg->task.key_channel(); + auto& my_val = val_[key]; - if (IamWorker()) { + if (isWorker()) { if (my_val.empty()) my_val.resize(kr.size(), 0); CHECK_GE(my_val.size(), kr.end()); my_val.segment(kr).copyFrom(recv_data); - } else if (IamServer()) { + } else if (isServer()) { // TODO this server can do flexible consistency control here if (my_val.empty()) { - // TODO user-defined intiailizer + // initialize weight my_val.resize(kr.size(), 0); + CHECK_NOTNULL(updater_)->InitKey(key, my_val.data(), my_val.size()); } - // TODO user-defined updater - CHECK_GE(my_val.size(), kr.end()); - my_val.segment(kr).eigenArray() += recv_data.eigenArray(); + // update weight + CHECK_GE(my_val.size(), kr.size()); + CHECK_NOTNULL(updater_)->Update(key, recv_data.data(), recv_data.size()); } } @@ -71,7 +83,7 @@ MessagePtrList KVArray::slice(const MessagePtr& msg, const KeyRangeList& krs) size_t n = krs.size(); MessagePtrList ret(n); Range kr(msg->task.key_range()); - for (int i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { ret[i] = MessagePtr(new Message()); ret[i]->miniCopyFrom(*msg); ret[i]->valid = true; @@ -91,10 +103,10 @@ MessagePtrList KVArray::slice(const MessagePtr& msg, const KeyRangeList& krs) } // divide the data - for (int i = 0; i < msg->value.size(); ++i) { + for (size_t i = 0; i < msg->value.size(); ++i) { SArray data(msg->value[i]); CHECK_EQ(data.size(), kr.size()); - for (int j = 0; j < n; ++j) { + for (size_t j = 0; j < n; ++j) { if (ret[j]->valid) { Range kr(ret[i]->task.key_range()); ret[i]->addValue(data.segment(kr)); diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h index 0f07abeeae8f..7a4c466170dd 100644 --- a/mshadow-ps/ps_dist-inl.h +++ b/mshadow-ps/ps_dist-inl.h @@ -9,8 +9,10 @@ #define MSHADOW_PS_DIST_INL_H_ #include "./ps.h" #include "./ps_local-inl.h" + #if MSHADOW_DIST_PS_ #include "./kv_array.h" +#include "system/app.h" namespace mshadow { namespace ps { template @@ -18,17 +20,15 @@ class DistServer : public LocalServer { public: // parent type typedef LocalServer Parent; - virtual void SetParam(const char *name, const char *val) { - Parent::SetParam(name, val); - if (!strcmp(name, "name")) name_ = val; - if (!strcmp(name, "parent_name")) parent_name_ = val; - } + + // virtual void SetParam(const char *name, const char *val) { + // Parent::SetParam(name, val); + // } + // initialize the parameter server virtual void Init(const std::vector &devices) { Parent::Init(devices); - CHECK(!name_.empty()); - CHECK(!parent_name_.empty()); - shared_model_ = new PS::KVArray(name_, parent_name_); + shared_model_ = new PS::KVArray(); if (this->custom_server != NULL) { delete this->custom_server; this->custom_server = NULL; @@ -36,6 +36,7 @@ class DistServer : public LocalServer { } 
virtual ~DistServer(void) { } + // remove custom, leave it empty virtual void ServerInitKey(Tensor weight, int key) {} // override this function, to use parameter server @@ -72,38 +73,31 @@ class DistServer : public LocalServer { } private: - std::string name_; - std::string parent_name_; PS::KVArray* shared_model_ = nullptr; }; template class MShadowServer : public PS::App { public: - MShadowerver(const std::string &conf) : App() { - server = CreateServer(); - server.Init(myRank(), conf); + // conf: get from the flag -app_conf + MShadowServer(const std::string &conf) : App() { + updater_ = CreateUpdater_(); + updater_->Init(myRank(), conf); + shared_model_ = new PS::KVArray(); + shared_model_->setUpdater(updater_); } - virtual ~HelloServer() { - delete server; - } - void init() { - - } - void init_key(int key, DType *dptr, size_t size) { - server->InitKey(key, dptr, size); - auto callback = [server](DType *data, size_t sz) { - server->Update(key, data, sz); - }; - // register callback of update function + virtual ~MShadowUpdater_() { + delete updater_; + delete shared_model_; } private: - // internal server - ICustomServer *server; + ICustomUpdater_ *updater_; + PS::KVArray* shared_model_; }; -// -// NOTE: do not add PS::CreateServer here -// add it in the program that uses mshadow-ps + +// NOTE: do not add PS::CreateServer here add it in the program that uses +// mshadow-ps + } // namespace ps } // namespace msahdow #endif diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index 9bcb942b4eab..d3618e343734 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -115,7 +115,7 @@ class LocalServer : public IParamServer { utils::Error("invalid value for parameter push_thread,"\ " can only be ndev or one"); } - } + } if (!strcmp(name, "update_on_server")) { update_on_server = atoi(val); } @@ -206,12 +206,12 @@ class LocalServer : public IParamServer { = new std::pair(); *p = std::make_pair(this, i); thread_pull_handler[i].Start(PullLocalThread, p); - } + } } else { - thread_pull_handler.resize(1); + thread_pull_handler.resize(1); thread_pull_handler[0].Start(PullGlobalThread, this); } - if (update_on_server != 0) { + if (update_on_server != 0) { custom_server = CreateServer(); for (size_t j = 0; j < cfgvec.size(); ++j) { custom_server->SetParam(cfgvec[j].first.c_str(), @@ -233,14 +233,14 @@ class LocalServer : public IParamServer { */ kGather = 1 }; - virtual void InitKey_(Shape<2> shape, + virtual void InitKey_(Shape<2> shape, int key, int devid) { - if (devid == devices[0]) { + if (devid == devices[0]) { this->InitPullMap(key); this->InitPushMap(key, shape); } } - + virtual void Push_(Tensor data, int key, int devid, int priority) { PullEntry &e = pull_map.GetRef(key); @@ -278,7 +278,7 @@ class LocalServer : public IParamServer { key); if (e.req[wid].ready) { if (perdev_pull_thread != 0) { - pull_queues[wid].Push(std::make_pair(key, devid)); + pull_queues[wid].Push(std::make_pair(key, devid)); } else { pull_queues[0].Push(std::make_pair(key, devid)); } @@ -349,12 +349,15 @@ class LocalServer : public IParamServer { } case kGather: { this->PullReady(data.FlatTo2D(), key); - return; + return; } default: utils::Error("unknown LocalOp"); } } + protected: + // customized server + ICustomServer *custom_server; private: /*! 
\brief task running */ struct PullTask { @@ -396,7 +399,7 @@ class LocalServer : public IParamServer { mshadow::FreeHost(&data); if (weight.dptr_ != NULL) { mshadow::FreeHost(&weight); - } + } } else { mshadow::FreeSpace(&data); if (weight.dptr_ != NULL) { @@ -422,7 +425,7 @@ class LocalServer : public IParamServer { utils::Assert(!need_weight || weight.CheckContiguous(), "Init"); num_copied = 0; copied.resize(ndevice, false); - } + } }; // a record to remember things related to pull request struct PullReqRecord { @@ -497,10 +500,8 @@ class LocalServer : public IParamServer { // lock to lock wait field utils::Mutex wait_lock; // conditional variable to do waiting - utils::ConditionVariable wait_cond; - // customized server - ICustomServer *custom_server; - //---------configurations of server------- + utils::ConditionVariable wait_cond; + //---------configurations of server------- int init_end; // whether perform update on serverside int update_on_server; @@ -521,21 +522,21 @@ class LocalServer : public IParamServer { #if defined(_OPENMP) if (data[0].MSize() >= bigarray_bound && nthread_reduction != 0) { - ms_omp_uint ntask = static_cast(data.size(1)); - #pragma omp parallel for schedule(static) num_threads(nthread_reduction) + ms_omp_uint ntask = static_cast(data.size(1)); + #pragma omp parallel for schedule(static) num_threads(nthread_reduction) for (ms_omp_uint j = 0; j < ntask; ++j) { for (index_t i = 1; i < data.size(0); ++i) { data[0][j] += data[i][j]; } } - } else - #endif + } else + #endif { for (index_t i = 1; i < data.size(0); ++i) { data[0] += data[i]; } } - } + } // push handler inline void PushProc(utils::ThreadPQueue *queue) { while (!destroy_signal) { @@ -583,7 +584,7 @@ class LocalServer : public IParamServer { for (size_t i = 0; i < devices.size(); ++i) { SetDevice(devices[i]); DeleteStream(push_stream[i]); - } + } } inline void PushHandlerLocal(size_t tid) { utils::Assert(tid < devices.size(), "threadid exceed boundary"); @@ -595,7 +596,7 @@ class LocalServer : public IParamServer { this->PushProc(&push_queues[tid]); SetDevice(devices[tid]); DeleteStream(push_stream[tid]); - } + } /*!\brief entry point of loader thread */ inline static MSHADOW_THREAD_PREFIX PushGlobalThread(void *pthread) { static_cast(pthread)->PushHandlerGlobal(); From 6aa741959d4447d50da4403413bb97e165310dcd Mon Sep 17 00:00:00 2001 From: muli Date: Thu, 22 Jan 2015 21:03:25 -0500 Subject: [PATCH 119/147] fix minor --- mshadow-ps/kv_array.h | 8 +++----- mshadow-ps/ps_dist-inl.h | 6 +++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/mshadow-ps/kv_array.h b/mshadow-ps/kv_array.h index 8f4237b813e2..04bb4c0afa1c 100644 --- a/mshadow-ps/kv_array.h +++ b/mshadow-ps/kv_array.h @@ -16,7 +16,7 @@ class KVArray : public SharedParameter { void setArray(int key, V* data, size_t size) { val_[key] = SArray(data, size, false); } - void setUpdater(ICustomServer* updater) { + void setUpdater(mshadow::ps::ICustomServer* updater) { updater_ = updater; } @@ -28,11 +28,9 @@ class KVArray : public SharedParameter { void setValue(const MessagePtr& msg); protected: std::unordered_map> val_; - // an array is place into multiple servers only if its length > min_slice_size + // an array is placed into multiple servers only if its length > min_slice_size size_t min_slice_size_ = 1000; - - ICustomServer* updater_ = nullptr; - private: + mshadow::ps::ICustomServer* updater_ = nullptr; }; diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h index 7a4c466170dd..cb5ae3fe0e1f 100644 --- 
a/mshadow-ps/ps_dist-inl.h +++ b/mshadow-ps/ps_dist-inl.h @@ -81,17 +81,17 @@ class MShadowServer : public PS::App { public: // conf: get from the flag -app_conf MShadowServer(const std::string &conf) : App() { - updater_ = CreateUpdater_(); + updater_ = CreateServer(); updater_->Init(myRank(), conf); shared_model_ = new PS::KVArray(); shared_model_->setUpdater(updater_); } - virtual ~MShadowUpdater_() { + virtual ~MShadowServer() { delete updater_; delete shared_model_; } private: - ICustomUpdater_ *updater_; + ICustomServer *updater_; PS::KVArray* shared_model_; }; From 4f499572203e6830335a3165ae55aa6d90bd91e8 Mon Sep 17 00:00:00 2001 From: muli Date: Thu, 22 Jan 2015 22:21:34 -0500 Subject: [PATCH 120/147] minor --- mshadow-ps/kv_array.h | 4 ++-- mshadow-ps/ps.h | 27 ++++++++++++++------------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/mshadow-ps/kv_array.h b/mshadow-ps/kv_array.h index 04bb4c0afa1c..c6403def61f7 100644 --- a/mshadow-ps/kv_array.h +++ b/mshadow-ps/kv_array.h @@ -8,8 +8,8 @@ DECLARE_string(app_name); template class KVArray : public SharedParameter { public: - KVArray(const string& my_name = FLAGS_app_name, - const string& parent_name = FLAGS_app_name + "_model") : + KVArray(const string& my_name = FLAGS_app_name + "_model", + const string& parent_name = FLAGS_app_name) : SharedParameter(my_name, parent_name) { } virtual ~KVArray() { } diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h index 34da246825d7..b2ee4970fd15 100644 --- a/mshadow-ps/ps.h +++ b/mshadow-ps/ps.h @@ -49,15 +49,15 @@ class IParamServer { */ virtual void SetParam(const char *name, const char *val) {} /*! - * \brief initialize the paramerver server client + * \brief initialize the paramerver server client * \param devices specifies the possible device id * to be input from Push and Pull, */ virtual void Init(const std::vector &devices) {} /*! - * \brief initialize the paramerver server client + * \brief initialize the paramerver server client * without specifying the devices, only device 0 is allowed - */ + */ inline void Init(void) { std::vector dev; dev.push_back(0); @@ -71,7 +71,7 @@ class IParamServer { * \param devid the device id this tensor lies in */ template - inline void InitKey(Shape shape, + inline void InitKey(Shape shape, int key, int devid) { this->InitKey_(shape.FlatTo2D(), key, devid); } @@ -95,7 +95,7 @@ class IParamServer { * the bigger the number is the higher the priority will be */ template - inline void Push(Tensor data, + inline void Push(Tensor data, int key, int devid = 0, int priority = 0) { @@ -117,7 +117,7 @@ class IParamServer { * \param callback_arg the argument to pass to callback */ template - inline void PullReq(Tensor data, + inline void PullReq(Tensor data, int key, int devid = 0, int priority = 0, @@ -128,9 +128,9 @@ class IParamServer { } #if __cplusplus >= 201103L template - inline void PullReq(Tensor data, + inline void PullReq(Tensor data, int key, - int devid, + int devid, int priority, std::function *stream)> callback) { // need to allocate space, because callback can happen latter.. @@ -147,7 +147,7 @@ class IParamServer { * this is unique per device * \param devid the device id this tensor lies in */ - virtual void InitKey_(Shape<2> shape, + virtual void InitKey_(Shape<2> shape, int key, int devid) = 0; /*! 
* \brief push out a tensor to parameter server @@ -185,7 +185,7 @@ class IParamServer { int priority, CallbackFunction callback, void *callback_arg) = 0; - + private: // C++11 support for lambda prepare function #if __cplusplus >= 201103L @@ -226,10 +226,10 @@ class ICustomServer { * \param dptr the data pointer * \param size size of the parameter key */ - virtual void Update(int key, DType *dptr, size_t size) = 0; + virtual void Update(int key, DType *dptr, size_t size) = 0; }; -/*! - * \brief create customized server +/*! + * \brief create customized server * this is a server defined by user * \return new server */ @@ -248,6 +248,7 @@ namespace ps { */ template inline IParamServer *Create(const char *type) { + printf("%s", type); if (!strcmp("local", type)) return new LocalServer(); #if MSHADOW_DIST_PS_ if (!strcmp("dist", type)) return new DistServer(); From bda087f5b9ada12398b182baa2320ca236e66f07 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 23 Jan 2015 13:47:38 -0800 Subject: [PATCH 121/147] chg --- mshadow-ps/ps_dist-inl.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h index cb5ae3fe0e1f..523a2a951348 100644 --- a/mshadow-ps/ps_dist-inl.h +++ b/mshadow-ps/ps_dist-inl.h @@ -36,9 +36,20 @@ class DistServer : public LocalServer { } virtual ~DistServer(void) { } - - // remove custom, leave it empty - virtual void ServerInitKey(Tensor weight, int key) {} + virtual void ServerInitKey(Tensor weight, int key) { + // this is called when key get initialized for the first time + // weight can be used to hold the model that pulled back + // use this to initialize the key on serverside + MessagePtr pull_msg(new Message(kServerGroup)); + pull_msg->task.set_key_channel(key); + Range(0, weight.MSize()).to(pull_msg->task.mutable_key_range()); + shared_model_->setArray(key, weight.dptr_, weight.MSize()); + pull_msg->fin_handle = [this, weight, key]() { + // call PullReady to notify LocalServer pulling is ready + this->PullReady(weight, key); + }; + shared_model_->pull(pull_msg); + } // override this function, to use parameter server virtual void HandlePushFinish(Tensor data, int key) { From 3184e528f404ae02efe09827a242fa38f9244458 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 23 Jan 2015 14:14:34 -0800 Subject: [PATCH 122/147] ok --- mshadow-ps/ps_dist-inl.h | 5 +++++ mshadow-ps/ps_local-inl.h | 19 +++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h index 523a2a951348..c20c5b492e17 100644 --- a/mshadow-ps/ps_dist-inl.h +++ b/mshadow-ps/ps_dist-inl.h @@ -36,6 +36,11 @@ class DistServer : public LocalServer { } virtual ~DistServer(void) { } + + protected: + // do nothing + virtual void InitCustomServer(void) { + } virtual void ServerInitKey(Tensor weight, int key) { // this is called when key get initialized for the first time // weight can be used to hold the model that pulled back diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index d3618e343734..379f727b867d 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -211,14 +211,7 @@ class LocalServer : public IParamServer { thread_pull_handler.resize(1); thread_pull_handler[0].Start(PullGlobalThread, this); } - if (update_on_server != 0) { - custom_server = CreateServer(); - for (size_t j = 0; j < cfgvec.size(); ++j) { - custom_server->SetParam(cfgvec[j].first.c_str(), - cfgvec[j].second.c_str()); - } - custom_server->Init(0, 
std::string()); - } + this->InitCustomServer(); this->init_end = 1; } @@ -311,6 +304,16 @@ class LocalServer : public IParamServer { } request_lock.Unlock(); } + virtual void InitCustomServer(void) { + if (update_on_server != 0) { + custom_server = CreateServer(); + for (size_t j = 0; j < cfgvec.size(); ++j) { + custom_server->SetParam(cfgvec[j].first.c_str(), + cfgvec[j].second.c_str()); + } + custom_server->Init(0, std::string()); + } + } virtual void ServerInitKey(Tensor weight, int key) { if (custom_server != NULL) { // intialize server, and ready for pullback From 94ca30a2d06d800e15ace9a52d01ec0821197071 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 23 Jan 2015 16:49:20 -0800 Subject: [PATCH 123/147] chang --- example/concat.cpp | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 example/concat.cpp diff --git a/example/concat.cpp b/example/concat.cpp deleted file mode 100644 index 16f5b77a0720..000000000000 --- a/example/concat.cpp +++ /dev/null @@ -1,33 +0,0 @@ -#include "mshadow/tensor.h" - -using namespace mshadow; -using namespace mshadow::expr; - - -void Print2D(const Tensor&t) { - for (int i = 0; i < t.size(0); ++i) { - for (int j = 0; j < t.size(1); ++j) { - printf("%.2f ", t[i][j]); - } - printf("\n"); - } -} - -int main() { - Tensor t1 = NewTensor(Shape4(2, 2, 3,2), 0.1f); - Tensor t2 = NewTensor(Shape4(2, 2, 3,2), 0.2f); - Tensor t3 = NewTensor(Shape4(2, 1, 3,2), 0.3f); - Tensor t = NewTensor(Shape4(2,5,3,2), 0.0f); - Tensor tr = NewTensor(Shape4(2,2,3,4), 0.0f); - t = concat<1>(t1, concat<1>(t2, t3)); - tr = concat<3>(t1, t2); - Print2D(t[0][2]); - Print2D(tr[0][2]); - t += 1.0f; - concat<1>(t1, concat<1>(t2, t3)) = t; - Print2D(t3[1][0]); - FreeSpace(&t1); - FreeSpace(&t2); - FreeSpace(&t3); - FreeSpace(&t); -} From 7d41305484c986776ca0a24c48a6442fcbb8d275 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 23 Jan 2015 16:49:26 -0800 Subject: [PATCH 124/147] chang --- example/Makefile | 7 +++---- example/basic.cpp | 5 ++--- example/basic_stream.cu | 1 - mshadow-ps/ps.h | 4 ++-- mshadow-ps/ps_dist-inl.h | 2 +- 5 files changed, 8 insertions(+), 11 deletions(-) diff --git a/example/Makefile b/example/Makefile index 931b2fc39427..2c3c2358e4f9 100644 --- a/example/Makefile +++ b/example/Makefile @@ -2,15 +2,15 @@ export CC = gcc export CXX = g++ export NVCC =nvcc -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -I/opt/intel/mkl/include -I/usr/local/cuda-6.0/include/ -L/opt/intel/mkl/lib/intel64 -L/opt/intel/lib/intel64 -L/usr/local/cuda-6.0/lib64 -export LDFLAGS= -lm -lcurand -lcublas +export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ +export LDFLAGS= -lm export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) # specify tensor path BIN = basic defop concat OBJ = CUOBJ = -CUBIN = basicx +CUBIN = .PHONY: clean all all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ) @@ -18,7 +18,6 @@ all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ) basic: basic.cpp concat: concat.cpp defop: defop.cpp -basicx: basic_gpu.cu $(BIN) : $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) diff --git a/example/basic.cpp b/example/basic.cpp index 09f9dcbccc80..dc0e2e39d513 100644 --- a/example/basic.cpp +++ b/example/basic.cpp @@ -6,9 +6,8 @@ using namespace mshadow; using namespace mshadow::expr; int main(void) { - // intialize tensor engine before using tensor operation, needed for CuBLAS - //InitTensorEngine(); + InitTensorEngine(0); // assume we have a float space double data[20]; // create a 2 x 5 x 2 tensor, from existing space @@ 
-59,6 +58,6 @@ int main(void) { printf("\n"); } // shutdown tensor enigne after usage - //ShutdownTensorEngine(); + ShutdownTensorEngine(); return 0; } diff --git a/example/basic_stream.cu b/example/basic_stream.cu index 5687bdd5aae0..18dc64ed4c7f 100644 --- a/example/basic_stream.cu +++ b/example/basic_stream.cu @@ -6,7 +6,6 @@ using namespace mshadow; using namespace mshadow::expr; int main(void) { - // intialize tensor engine before using tensor operation, needed for CuBLAS InitTensorEngine(); // create a 2 x 5 tensor, from existing space diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h index b2ee4970fd15..732389bade40 100644 --- a/mshadow-ps/ps.h +++ b/mshadow-ps/ps.h @@ -17,8 +17,8 @@ #include "../mshadow/tensor.h" /*! \brief whether to adapt distributed PS from parameter-server */ -#ifndef MSHADOW_DIST_PS_ -#define MSHADOW_DIST_PS_ 1 +#ifndef MSHADOW_DIST_PS +#define MSHADOW_DIST_PS 1 #endif namespace mshadow { diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h index c20c5b492e17..a7f6f43ca4f5 100644 --- a/mshadow-ps/ps_dist-inl.h +++ b/mshadow-ps/ps_dist-inl.h @@ -10,7 +10,7 @@ #include "./ps.h" #include "./ps_local-inl.h" -#if MSHADOW_DIST_PS_ +#if MSHADOW_DIST_PS #include "./kv_array.h" #include "system/app.h" namespace mshadow { From 6726594040b41f44cd7585d0dc82715bd28afd10 Mon Sep 17 00:00:00 2001 From: muli Date: Fri, 23 Jan 2015 20:17:22 -0500 Subject: [PATCH 125/147] remove debug print --- mshadow-ps/kv_array.h | 15 +++++++++++---- mshadow-ps/ps.h | 1 - mshadow-ps/ps_dist-inl.h | 28 +++++++++++++++++++++------- mshadow-ps/ps_local-inl.h | 20 +++++++++++--------- 4 files changed, 43 insertions(+), 21 deletions(-) diff --git a/mshadow-ps/kv_array.h b/mshadow-ps/kv_array.h index c6403def61f7..39f316a7c55d 100644 --- a/mshadow-ps/kv_array.h +++ b/mshadow-ps/kv_array.h @@ -67,9 +67,16 @@ template void KVArray::getValue(const MessagePtr& msg) { auto& my_val = val_[msg->task.key_channel()]; Range kr(msg->task.key_range()); - CHECK_GE(my_val.size(), kr.end()); + if (my_val.empty()) { + // initialize weight + my_val.resize(kr.size(), 0); + CHECK_NOTNULL(updater_)->InitKey(msg->task.key_channel(), my_val.data(), my_val.size()); + } + + // TODO store the kr in memory + CHECK_EQ(my_val.size(), kr.size()); SArray send_data(kr.size()); - send_data.copyFrom(my_val.segment(kr)); + send_data.copyFrom(my_val); msg->addValue(send_data); } @@ -106,8 +113,8 @@ MessagePtrList KVArray::slice(const MessagePtr& msg, const KeyRangeList& krs) CHECK_EQ(data.size(), kr.size()); for (size_t j = 0; j < n; ++j) { if (ret[j]->valid) { - Range kr(ret[i]->task.key_range()); - ret[i]->addValue(data.segment(kr)); + Range kr(ret[j]->task.key_range()); + ret[j]->addValue(data.segment(kr)); } } } diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h index b2ee4970fd15..7318ce472757 100644 --- a/mshadow-ps/ps.h +++ b/mshadow-ps/ps.h @@ -248,7 +248,6 @@ namespace ps { */ template inline IParamServer *Create(const char *type) { - printf("%s", type); if (!strcmp("local", type)) return new LocalServer(); #if MSHADOW_DIST_PS_ if (!strcmp("dist", type)) return new DistServer(); diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h index cb5ae3fe0e1f..1fad69646a91 100644 --- a/mshadow-ps/ps_dist-inl.h +++ b/mshadow-ps/ps_dist-inl.h @@ -21,10 +21,6 @@ class DistServer : public LocalServer { // parent type typedef LocalServer Parent; - // virtual void SetParam(const char *name, const char *val) { - // Parent::SetParam(name, val); - // } - // initialize the parameter server virtual void Init(const std::vector 
&devices) { Parent::Init(devices); @@ -37,8 +33,26 @@ class DistServer : public LocalServer { virtual ~DistServer(void) { } + protected: + virtual void InitCustomerServer(void) { + } // remove custom, leave it empty - virtual void ServerInitKey(Tensor weight, int key) {} + virtual void ServerInitKey(Tensor weight, int key) { + // this is called when key get initialized for the first time + // weight can be used to hold the model that pulled back + // use this to initialize the key on serverside + using namespace PS; + MessagePtr pull_msg(new Message(kServerGroup)); + pull_msg->task.set_key_channel(key); + Range(0, weight.MSize()).to(pull_msg->task.mutable_key_range()); + shared_model_->setArray(key, weight.dptr_, weight.MSize()); + pull_msg->fin_handle = [this, weight, key]() { + // call PullReady to notify LocalServer pulling is ready + this->PullReady(weight, key); + }; + shared_model_->pull(pull_msg); + } + // override this function, to use parameter server virtual void HandlePushFinish(Tensor data, int key) { @@ -51,11 +65,10 @@ class DistServer : public LocalServer { Tensor sendrecv = data[0]; using namespace PS; utils::Assert(data[0].CheckContiguous(), "data must be contiguous"); - // TODO the zero copy version - // SArray val(data.dptr_, data.MSize(), false); SArray val; val.copyFrom(sendrecv.dptr_, sendrecv.MSize()); MessagePtr push_msg(new Message(kServerGroup)); push_msg->addValue(val); + // LL << val; push_msg->task.set_key_channel(key); Range(0, val.size()).to(push_msg->task.mutable_key_range()); int push_time = CHECK_NOTNULL(shared_model_)->push(push_msg); @@ -82,6 +95,7 @@ class MShadowServer : public PS::App { // conf: get from the flag -app_conf MShadowServer(const std::string &conf) : App() { updater_ = CreateServer(); + updater_->Init(myRank(), conf); shared_model_ = new PS::KVArray(); shared_model_->setUpdater(updater_); diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index d3618e343734..df2a05c797f7 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -20,7 +20,6 @@ typedef unsigned ms_omp_uint; #include "./thread.h" #include "./thread_util.h" -#include "./ps.h" namespace mshadow { namespace ps { @@ -211,14 +210,7 @@ class LocalServer : public IParamServer { thread_pull_handler.resize(1); thread_pull_handler[0].Start(PullGlobalThread, this); } - if (update_on_server != 0) { - custom_server = CreateServer(); - for (size_t j = 0; j < cfgvec.size(); ++j) { - custom_server->SetParam(cfgvec[j].first.c_str(), - cfgvec[j].second.c_str()); - } - custom_server->Init(0, std::string()); - } + this->InitCustomerServer(); this->init_end = 1; } @@ -355,6 +347,16 @@ class LocalServer : public IParamServer { } } + virtual void InitCustomerServer(void) { + if (update_on_server != 0) { + custom_server = CreateServer(); + for (size_t j = 0; j < cfgvec.size(); ++j) { + custom_server->SetParam(cfgvec[j].first.c_str(), + cfgvec[j].second.c_str()); + } + custom_server->Init(0, std::string()); + } + } protected: // customized server ICustomServer *custom_server; From c91dca4ae9389db5714bf8854deb7f9bb11bd4e5 Mon Sep 17 00:00:00 2001 From: muli Date: Sat, 24 Jan 2015 13:39:52 -0500 Subject: [PATCH 126/147] refactor --- mshadow-ps/kv_array.h | 4 ++-- mshadow-ps/ps.h | 18 +++++++++--------- mshadow-ps/ps_dist-inl.h | 16 ++++++++-------- mshadow-ps/ps_local-inl.h | 32 ++++++++++++++++---------------- mshadow-ps/test.cpp | 4 ++-- 5 files changed, 37 insertions(+), 37 deletions(-) diff --git a/mshadow-ps/kv_array.h b/mshadow-ps/kv_array.h index 
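
The ServerInitKey above seeds each key by issuing a plain pull: the server allocates and initializes the weight lazily on the first request (the getValue change earlier in this patch), so every worker's first pull returns the same starting model. The control flow in miniature — LazyStore is a stand-in, not the PS API:

    #include <cstdio>
    #include <map>
    #include <vector>

    // Lazy server-side store: a key is materialized on first access, so
    // whichever worker pulls first triggers exactly one initialization.
    struct LazyStore {
      std::map<int, std::vector<float> > val;
      const std::vector<float> &Pull(int key, size_t size) {
        if (val[key].empty()) {
          val[key].assign(size, 0.0f);  // a user-defined init would go here
          std::printf("key %d initialized\n", key);
        }
        return val[key];
      }
    };

    int main() {
      LazyStore server;
      const std::vector<float> &w1 = server.Pull(3, 16);  // init happens here
      const std::vector<float> &w2 = server.Pull(3, 16);  // already there
      return (w1[0] == w2[0]) ? 0 : 1;
    }
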
39f316a7c55d..2bf10a47b226 100644 --- a/mshadow-ps/kv_array.h +++ b/mshadow-ps/kv_array.h @@ -16,7 +16,7 @@ class KVArray : public SharedParameter { void setArray(int key, V* data, size_t size) { val_[key] = SArray(data, size, false); } - void setUpdater(mshadow::ps::ICustomServer* updater) { + void setUpdater(mshadow::ps::IModelUpdater* updater) { updater_ = updater; } @@ -30,7 +30,7 @@ class KVArray : public SharedParameter { std::unordered_map> val_; // an array is placed into multiple servers only if its length > min_slice_size size_t min_slice_size_ = 1000; - mshadow::ps::ICustomServer* updater_ = nullptr; + mshadow::ps::IModelUpdater* updater_ = nullptr; }; diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h index 8989364092e1..5fa99109a505 100644 --- a/mshadow-ps/ps.h +++ b/mshadow-ps/ps.h @@ -30,7 +30,7 @@ namespace ps { */ template -class IParamServer { +class ISharedModel { public: /*! * \brief callback function that will be executed when pull request finishes @@ -41,7 +41,7 @@ class IParamServer { */ typedef void (CallbackFunction) (Stream *stream, void *arg); /*! \brief virtual destructor */ - virtual ~IParamServer(void) {} + virtual ~ISharedModel(void) {} /*! * \brief Set param for the layer from string * \param name parameter name @@ -199,9 +199,9 @@ class IParamServer { }; /*! \brief interface for customized mshadow server */ template -class ICustomServer { +class IModelUpdater { public: - virtual ~ICustomServer(void) {} + virtual ~IModelUpdater(void) {} /*! * \brief set parameters from outside * \param name name of parameter @@ -234,7 +234,7 @@ class ICustomServer { * \return new server */ template -ICustomServer *CreateServer(void); +IModelUpdater *CreateModelUpdater(void); } // namespace ps } // namespace mshadow @@ -247,10 +247,10 @@ namespace ps { * \param type the type of paramerver server */ template -inline IParamServer *Create(const char *type) { - if (!strcmp("local", type)) return new LocalServer(); -#if MSHADOW_DIST_PS_ - if (!strcmp("dist", type)) return new DistServer(); +inline ISharedModel *CreateSharedModel(const char *type) { + if (!strcmp("local", type)) return new LocalModel(); +#if MSHADOW_DIST_PS + if (!strcmp("dist", type)) return new DistModel(); #endif utils::Error("unknown server type %s\n", type); return NULL; diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h index f4ee32864058..faea92bd71da 100644 --- a/mshadow-ps/ps_dist-inl.h +++ b/mshadow-ps/ps_dist-inl.h @@ -16,10 +16,10 @@ namespace mshadow { namespace ps { template -class DistServer : public LocalServer { +class DistModel : public LocalModel { public: // parent type - typedef LocalServer Parent; + typedef LocalModel Parent; // initialize the parameter server virtual void Init(const std::vector &devices) { @@ -30,7 +30,7 @@ class DistServer : public LocalServer { this->custom_server = NULL; } } - virtual ~DistServer(void) { + virtual ~DistModel(void) { } protected: @@ -89,22 +89,22 @@ class DistServer : public LocalServer { }; template -class MShadowServer : public PS::App { +class MShadowServerNode : public PS::App { public: // conf: get from the flag -app_conf - MShadowServer(const std::string &conf) : App() { - updater_ = CreateServer(); + MShadowServerNode(const std::string &conf) : App() { + updater_ = CreateModelUpdater(); updater_->Init(myRank(), conf); shared_model_ = new PS::KVArray(); shared_model_->setUpdater(updater_); } - virtual ~MShadowServer() { + virtual ~MShadowServerNode() { delete updater_; delete shared_model_; } private: - ICustomServer *updater_; + 
IModelUpdater *updater_; PS::KVArray* shared_model_; }; diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index df2a05c797f7..d6c9a2655019 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -25,13 +25,13 @@ namespace mshadow { namespace ps { // multi-threaded implementation of template -class LocalServer : public IParamServer { +class LocalModel : public ISharedModel { public: // redefine callback function - typedef typename IParamServer::CallbackFunction + typedef typename ISharedModel::CallbackFunction CallbackFunction; // constructor - LocalServer(void) { + LocalModel(void) { init_end = 0; perdev_pull_thread = 1; perdev_push_thread = 1; @@ -42,7 +42,7 @@ class LocalServer : public IParamServer { custom_server = NULL; } // destructor - virtual ~LocalServer(void) { + virtual ~LocalModel(void) { if (init_end != 0) { destroy_signal = true; for (size_t i = 0; i < push_queues.size(); ++i) { @@ -188,8 +188,8 @@ class LocalServer : public IParamServer { if (perdev_push_thread != 0) { thread_push_handler.resize(devices.size()); for (size_t i = 0; i < devices.size(); ++i) { - std::pair *p - = new std::pair(); + std::pair *p + = new std::pair(); *p = std::make_pair(this, i); thread_push_handler[i].Start(PushLocalThread, p); } @@ -201,8 +201,8 @@ class LocalServer : public IParamServer { if (perdev_pull_thread != 0) { thread_pull_handler.resize(devices.size()); for (size_t i = 0; i < devices.size(); ++i) { - std::pair *p - = new std::pair(); + std::pair *p + = new std::pair(); *p = std::make_pair(this, i); thread_pull_handler[i].Start(PullLocalThread, p); } @@ -349,7 +349,7 @@ class LocalServer : public IParamServer { virtual void InitCustomerServer(void) { if (update_on_server != 0) { - custom_server = CreateServer(); + custom_server = CreateModelUpdater(); for (size_t j = 0; j < cfgvec.size(); ++j) { custom_server->SetParam(cfgvec[j].first.c_str(), cfgvec[j].second.c_str()); @@ -359,7 +359,7 @@ class LocalServer : public IParamServer { } protected: // customized server - ICustomServer *custom_server; + IModelUpdater *custom_server; private: /*! 
\brief task running */ struct PullTask { @@ -601,13 +601,13 @@ class LocalServer : public IParamServer { } /*!\brief entry point of loader thread */ inline static MSHADOW_THREAD_PREFIX PushGlobalThread(void *pthread) { - static_cast(pthread)->PushHandlerGlobal(); + static_cast(pthread)->PushHandlerGlobal(); utils::ThreadExit(NULL); return NULL; } inline static MSHADOW_THREAD_PREFIX PushLocalThread(void *arg) { - std::pair *p - = static_cast*>(arg); + std::pair *p + = static_cast*>(arg); p->first->PushHandlerLocal(p->second); delete p; utils::ThreadExit(NULL); @@ -680,13 +680,13 @@ class LocalServer : public IParamServer { } /*!\brief entry point of pull thread, one thread for all devices */ inline static MSHADOW_THREAD_PREFIX PullGlobalThread(void *arg) { - static_cast(arg)->PullHandlerGlobal(); + static_cast(arg)->PullHandlerGlobal(); utils::ThreadExit(NULL); return NULL; } inline static MSHADOW_THREAD_PREFIX PullLocalThread(void *arg) { - std::pair *p - = static_cast*>(arg); + std::pair *p + = static_cast*>(arg); p->first->PullHandlerLocal(p->second); delete p; utils::ThreadExit(NULL); diff --git a/mshadow-ps/test.cpp b/mshadow-ps/test.cpp index 1906f87e4429..aebbaf2906e7 100644 --- a/mshadow-ps/test.cpp +++ b/mshadow-ps/test.cpp @@ -16,9 +16,9 @@ void Print2DTensor(Tensor const &ts) { int main(int argc, char *argv[]) { if (argc < 2) { printf("Usage:\n"); return 0; - } + } int ndev = atoi(argv[1]); - ps::IParamServer *ps = ps::Create("local"); + ps::ISharedModel *ps = ps::CreateSharedModel("local"); TensorContainer ts(Shape3(ndev,5,2)); TensorContainer res(Shape3(ndev,5,2)); std::vector devs; From bf119898afa8f83323512bafca30817ff8d3a6d3 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 24 Jan 2015 20:04:53 -0800 Subject: [PATCH 127/147] update thread --- mshadow-ps/test.cpp | 6 ++++++ mshadow-ps/thread.h | 20 ++++++++++++++++++++ mshadow/base.h | 17 ++++++++++++++--- 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/mshadow-ps/test.cpp b/mshadow-ps/test.cpp index aebbaf2906e7..4024ec356ba2 100644 --- a/mshadow-ps/test.cpp +++ b/mshadow-ps/test.cpp @@ -1,3 +1,9 @@ +#define MSHADOW_STAND_ALONE 1 +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_DEPRECATE +#define NOMINMAX +#endif #include "./ps.h" using namespace mshadow; void Print1DTensor(Tensor const &ts) { diff --git a/mshadow-ps/thread.h b/mshadow-ps/thread.h index 7451f12052b0..288b56228952 100644 --- a/mshadow-ps/thread.h +++ b/mshadow-ps/thread.h @@ -30,6 +30,26 @@ class Semaphore { private: HANDLE sem; }; +/*! \brief mutex under windows */ +class Mutex { + public: + inline void Init(void) { + pthread_mutex_init(&mutex, NULL); + } + inline void Lock(void) { + pthread_mutex_lock(&mutex); + } + inline void Unlock(void) { + pthread_mutex_unlock(&mutex); + } + inline void Destroy(void) { + pthread_mutex_destroy(&mutex); + } + + private: + friend class ConditionVariable; + pthread_mutex_t mutex; +}; /*! 
\brief simple thread that wraps windows thread */ class Thread { private: diff --git a/mshadow/base.h b/mshadow/base.h index b031859a6e78..ddd97a9d0407 100644 --- a/mshadow/base.h +++ b/mshadow/base.h @@ -7,6 +7,11 @@ */ #ifndef MSHADOW_BASE_H_ #define MSHADOW_BASE_H_ +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_DEPRECATE +#define NOMINMAX +#endif #include #include #include @@ -116,13 +121,19 @@ extern "C" { #ifdef MSHADOW_XINLINE #error "MSHADOW_XINLINE must not be defined" #endif +#ifdef _MSC_VER +#define MSHADOW_FORCE_INLINE __forceinline +#pragma warning( disable : 4068 ) +#else +#define MSHADOW_FORCE_INLINE inline __attribute__((always_inline)) +#endif #ifdef __CUDACC__ - #define MSHADOW_XINLINE inline __attribute__((always_inline)) __device__ __host__ + #define MSHADOW_XINLINE MSHADOW_FORCE_INLINE __device__ __host__ #else - #define MSHADOW_XINLINE inline __attribute__((always_inline)) + #define MSHADOW_XINLINE MSHADOW_FORCE_INLINE #endif /*! \brief cpu force inline */ -#define MSHADOW_CINLINE inline __attribute__((always_inline)) +#define MSHADOW_CINLINE MSHADOW_FORCE_INLINE #if defined(__GXX_EXPERIMENTAL_CXX0X) ||\ defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L From 647a6e51620a25f945525d5d7c87409d2f691fdd Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 24 Jan 2015 20:43:04 -0800 Subject: [PATCH 128/147] bug fix under msvc, thanks to windows --- mshadow-ps/test.cpp | 17 +++++++++++------ mshadow-ps/thread.h | 42 ++++++++++++++++++++++++++++++++++++------ mshadow/tensor.h | 4 +--- 3 files changed, 48 insertions(+), 15 deletions(-) diff --git a/mshadow-ps/test.cpp b/mshadow-ps/test.cpp index 4024ec356ba2..51467bbe0918 100644 --- a/mshadow-ps/test.cpp +++ b/mshadow-ps/test.cpp @@ -4,6 +4,7 @@ #define _CRT_SECURE_NO_DEPRECATE #define NOMINMAX #endif +#define MSHADOW_DIST_PS 0 #include "./ps.h" using namespace mshadow; void Print1DTensor(Tensor const &ts) { @@ -30,18 +31,14 @@ int main(int argc, char *argv[]) { std::vector devs; for (int i = 0; i < ndev; ++i) { devs.push_back(i); - ts[i] = 1.0 + i; + //ts[i] = static_cast(1.0 + i); } ps->Init(devs); for (int i = 0; i < ndev; ++i) { ps->Push(ts[i], 3, i); int a = i; ps->PullWait(3, i); - ps->PullReq(res[i], 3, i, 0, [&](Stream *stream) { - printf("hello i=%d, a=%d,remember during callback, do not take local varaible.. \n", i, a); - ts += 1.0f; - } - ); + ps->PullReq(res[i], 3, i, 0); } for (int i = 0; i < ndev; ++i) { ps->PullWait(3, i); @@ -51,3 +48,11 @@ int main(int argc, char *argv[]) { } return 0; } +namespace mshadow { +namespace ps { +template<> +mshadow::ps::IModelUpdater *CreateModelUpdater(){ + return NULL; +} +} +} \ No newline at end of file diff --git a/mshadow-ps/thread.h b/mshadow-ps/thread.h index 288b56228952..382e17a447bf 100644 --- a/mshadow-ps/thread.h +++ b/mshadow-ps/thread.h @@ -25,31 +25,61 @@ class Semaphore { utils::Check(WaitForSingleObject(sem, INFINITE) == WAIT_OBJECT_0, "WaitForSingleObject error"); } inline void Post(void) { - utils::Check(ReleaseSemaphore(sem, 1, NULL) != 0, "ReleaseSemaphore error"); + utils::Check(ReleaseSemaphore(sem, 1, NULL) != 0, "ReleaseSemaphore error"); } private: HANDLE sem; }; + /*! 
\brief mutex under windows */ class Mutex { public: inline void Init(void) { - pthread_mutex_init(&mutex, NULL); + utils::Check(InitializeCriticalSectionAndSpinCount(&mutex, 0x00000400) != 0, + "Mutex::Init fail"); } inline void Lock(void) { - pthread_mutex_lock(&mutex); + EnterCriticalSection(&mutex); } inline void Unlock(void) { - pthread_mutex_unlock(&mutex); + LeaveCriticalSection(&mutex); } inline void Destroy(void) { - pthread_mutex_destroy(&mutex); + DeleteCriticalSection(&mutex); } private: friend class ConditionVariable; - pthread_mutex_t mutex; + CRITICAL_SECTION mutex; }; + +// conditional variable that uses pthread +class ConditionVariable { + public: + // initialize conditional variable + inline void Init(void) { + InitializeConditionVariable(&cond); + } + // destroy the thread + inline void Destroy(void) { + //DeleteConditionVariable(&cond); + } + // wait on the conditional variable + inline void Wait(Mutex *mutex) { + utils::Check(SleepConditionVariableCS(&cond, &(mutex->mutex), INFINITE) != 0, + "ConditionVariable:Wait fail"); + } + inline void Broadcast(void) { + WakeAllConditionVariable(&cond); + } + inline void Signal(void) { + WakeConditionVariable(&cond); + } + + private: + CONDITION_VARIABLE cond; +}; + /*! \brief simple thread that wraps windows thread */ class Thread { private: diff --git a/mshadow/tensor.h b/mshadow/tensor.h index d54813a49b19..7871697f5c36 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -344,10 +344,9 @@ struct Tensor: public TRValue, s, stride_, stream_); } /*!\brief implement the assignment of same type */ - template inline Tensor & operator=(const Tensor &exp) { - dptr_ = exp.dptr; + dptr_ = exp.dptr_; shape_ = exp.shape_; stride_ = exp.stride_; stream_ = exp.stream_; @@ -410,7 +409,6 @@ struct Tensor: return dptr_[idx]; } /*!\brief implement the assignment of same type */ - template inline Tensor & operator=(const Tensor &exp) { dptr_ = exp.dptr; From 04fb2170897c87895324d649445e9f47335a34b5 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 25 Jan 2015 17:16:17 -0800 Subject: [PATCH 129/147] move makefile config to mshadow --- example/basic.cpp | 63 ------------------- guide/.gitignore | 2 + {example => guide}/Makefile | 11 ++-- guide/basic.cpp | 41 ++++++++++++ {example => guide}/basic_stream.cu | 0 guide/config.mk | 35 +++++++++++ {example => guide}/defop.cpp | 23 +++---- {example => guide}/exp-template/Makefile | 0 {example => guide}/exp-template/README.md | 0 {example => guide}/exp-template/exp_lazy.cpp | 0 .../exp-template/exp_template.cpp | 0 .../exp-template/exp_template_op.cpp | 0 {example => guide}/neuralnet/Makefile | 0 {example => guide}/neuralnet/README.md | 0 {example => guide}/neuralnet/convnet.cu | 0 {example => guide}/neuralnet/nnet.cu | 0 {example => guide}/neuralnet/run.sh | 0 {example => guide}/neuralnet/util.h | 0 make/README.md | 17 +++++ make/mshadow.mk | 59 +++++++++++++++++ mshadow/base.h | 4 -- 21 files changed, 172 insertions(+), 83 deletions(-) delete mode 100644 example/basic.cpp create mode 100644 guide/.gitignore rename {example => guide}/Makefile (72%) create mode 100644 guide/basic.cpp rename {example => guide}/basic_stream.cu (100%) create mode 100644 guide/config.mk rename {example => guide}/defop.cpp (68%) rename {example => guide}/exp-template/Makefile (100%) rename {example => guide}/exp-template/README.md (100%) rename {example => guide}/exp-template/exp_lazy.cpp (100%) rename {example => guide}/exp-template/exp_template.cpp (100%) rename {example => guide}/exp-template/exp_template_op.cpp (100%) 
rename {example => guide}/neuralnet/Makefile (100%) rename {example => guide}/neuralnet/README.md (100%) rename {example => guide}/neuralnet/convnet.cu (100%) rename {example => guide}/neuralnet/nnet.cu (100%) rename {example => guide}/neuralnet/run.sh (100%) rename {example => guide}/neuralnet/util.h (100%) create mode 100644 make/README.md create mode 100644 make/mshadow.mk diff --git a/example/basic.cpp b/example/basic.cpp deleted file mode 100644 index dc0e2e39d513..000000000000 --- a/example/basic.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// header file to use mshadow -#include "mshadow/tensor.h" -// this namespace contains all data structures, functions -using namespace mshadow; -// this namespace contains all operator overloads -using namespace mshadow::expr; - -int main(void) { - // intialize tensor engine before using tensor operation, needed for CuBLAS - InitTensorEngine(0); - // assume we have a float space - double data[20]; - // create a 2 x 5 x 2 tensor, from existing space - Tensor ts(data, Shape3(2,5,2)); - Tensor ts4(data, Shape4(2,2,2,2)); - // take first subscript of the tensor - Tensor mat = ts[0]; - // Tensor object is only a handle, assignment means they have same data content - Tensor mat2= NewTensor(Shape1(2), 0.0f); - Tensor ts1= NewTensor(ts.shape_, 0.0f); - Random rnd(0); - ts.stream_ = NewStream(); - DeleteStream(ts.stream_); - - mat2 = rnd.uniform(mat2.shape_); - // shape of matrix, note shape order is different from numpy - // shape[i] indicate the shape of i-th dimension - printf("%u X %u matrix, stride=%u\n", mat.size(0), mat.size(1), mat.stride_); - - // assign some values - mat[0][1] = 1.0f; mat[1][0] = 2.0f; - // elementwise operations - - //ts = broadcast<0>(mat2, ts.shape_); - mat2 = sumall_except_dim<0>(mat); - // print out matrix, note: mat2 and mat1 are handles(pointers) - for (index_t c = 0; c < ts.size(0); ++c) { - for (index_t i = 0; i < mat.size(0); ++i) { - for (index_t j = 0; j < mat.size(1); ++j) { - printf("%.2f ", ts[c][i][j]); - } - printf("\n"); - } - } - // create a tensor without explictly allocating spaces. - Tensor mat3 = NewTensor(Shape2(2, 5), 0.0f); - Tensor mat4 = NewTensor(Shape2(2, 5), 1.0f); - // transpose, and then add mat4. - mat3 = tcast(mat.T()) + mat4; - - // index the shape using size(), this is more natural for MATLAB/numpy user. 
- printf("%u X %u matrix\n", mat3.size(0), mat3.size(1)); - // print out matrix - for (index_t i = 0; i < mat3.size(0); ++i) { - for (index_t j = 0; j < mat3.size(1); ++j) { - printf("%.2f ", mat3[i][j]); - } - printf("\n"); - } - // shutdown tensor enigne after usage - ShutdownTensorEngine(); - return 0; -} diff --git a/guide/.gitignore b/guide/.gitignore new file mode 100644 index 000000000000..f4ccede58e76 --- /dev/null +++ b/guide/.gitignore @@ -0,0 +1,2 @@ +defop +basic \ No newline at end of file diff --git a/example/Makefile b/guide/Makefile similarity index 72% rename from example/Makefile rename to guide/Makefile index 2c3c2358e4f9..930867bb7bf2 100644 --- a/example/Makefile +++ b/guide/Makefile @@ -2,12 +2,14 @@ export CC = gcc export CXX = g++ export NVCC =nvcc -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -export LDFLAGS= -lm -export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) +include config.mk +include ../make/mshadow.mk +export CFLAGS = -Wall -O3 -I../ $(MSHADOW_CFLAGS) +export LDFLAGS= -lm $(MSHADOW_LDFLAGS) +export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) # specify tensor path -BIN = basic defop concat +BIN = basic defop OBJ = CUOBJ = CUBIN = @@ -16,7 +18,6 @@ CUBIN = all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ) basic: basic.cpp -concat: concat.cpp defop: defop.cpp $(BIN) : diff --git a/guide/basic.cpp b/guide/basic.cpp new file mode 100644 index 000000000000..260c6054d44b --- /dev/null +++ b/guide/basic.cpp @@ -0,0 +1,41 @@ +// header file to use mshadow +#include "mshadow/tensor.h" +// this namespace contains all data structures, functions +using namespace mshadow; +// this namespace contains all operator overloads +using namespace mshadow::expr; + +int main(void) { + // intialize tensor engine before using tensor operation, needed for CuBLAS + InitTensorEngine(); + // assume we have a float space + float data[20]; + // create a 2 x 5 x 2 tensor, from existing space + Tensor ts(data, Shape3(2,5,2)); + // take first subscript of the tensor + Tensor mat = ts[0]; + // Tensor object is only a handle, assignment means they have same data content + // we can specify content type of a Tensor, if not specified, it is float bydefault + Tensor mat2 = mat; + + // shaape of matrix, note size order is same as numpy + printf("%u X %u matrix\n", mat.size(1), mat.size(1)); + + // initialize all element to zero + mat = 0.0f; + // assign some values + mat[0][1] = 1.0f; mat[1][0] = 2.0f; + // elementwise operations + mat += (mat + 10.0f) / 10.0f + 2.0f; + + // print out matrix, note: mat2 and mat1 are handles(pointers) + for (index_t i = 0; i < mat.size(0); ++i) { + for (index_t j = 0; j < mat.size(1); ++j) { + printf("%.2f ", mat2[i][j]); + } + printf("\n"); + } + // shutdown tensor enigne after usage + ShutdownTensorEngine(); + return 0; +} diff --git a/example/basic_stream.cu b/guide/basic_stream.cu similarity index 100% rename from example/basic_stream.cu rename to guide/basic_stream.cu diff --git a/guide/config.mk b/guide/config.mk new file mode 100644 index 000000000000..bcd30bc69a3c --- /dev/null +++ b/guide/config.mk @@ -0,0 +1,35 @@ +#--------------------------------------------------------------------------------------- +# mshadow: the configuration compile script +# +# This is configuration script that you can use to compile mshadow +# Usage: +# +# include config.mk in your Makefile, or directly include the definition of variables +# include mshadow.mk after the variables are set +# +# Add MSHADOW_CFLAGS to the compile flags +# 
Add MSHADOW_LDFLAGS to the linker flags +# Add MSHADOW_NVCCFLAGS to the nvcc compile flags +#---------------------------------------------------------------------------------------- + +# whether use CUDA during compile +USE_CUDA = 0 + +# add the path to CUDA libary to link and compile flag +# if you have already add them to enviroment variable, leave it as NONE +USE_CUDA_PATH = NONE + +# +# choose the version of blas you want to use +# can be: mkl, blas, atlas, openblas +USE_BLAS = blas +# +# add path to intel libary, you may need it +# for MKL, if you did not add the path to enviroment variable +# +USE_INTEL_PATH = NONE + +# whether compile with parameter server +USE_DIST_PS = 0 +PS_PATH = NONE +PS_THIRD_PATH = NONE diff --git a/example/defop.cpp b/guide/defop.cpp similarity index 68% rename from example/defop.cpp rename to guide/defop.cpp index d78616213353..5843f75842de 100644 --- a/example/defop.cpp +++ b/guide/defop.cpp @@ -7,40 +7,41 @@ using namespace mshadow; using namespace mshadow::expr; // user defined unary operator addone -struct addone{ +struct addone { + // map can be template function template MSHADOW_XINLINE static DType Map(DType a) { return a + static_cast(1); } }; // user defined binary operator max of two -struct maxoftwo{ - template - MSHADOW_XINLINE static DType Map(DType a, DType b) { +struct maxoftwo { + // map can also be normal functions, + // however, this can only be applied to float tensor + MSHADOW_XINLINE static float Map(float a, float b) { if(a > b) return a; else return b; } }; -int main(void){ +int main(void) { // intialize tensor engine before using tensor operation, needed for CuBLAS - //InitTensorEngine(); + InitTensorEngine(); // take first subscript of the tensor Tensor mat = NewTensor(Shape2(2,3), 0.0f); Tensor mat2= NewTensor(Shape2(2,3), 0.0f); mat[0][0] = -2.0f; - mat = F(F(mat) + 1.0f, mat2); + mat = F(F(mat) + 0.5f, mat2); - for(index_t i = 0; i < mat.size(0); ++i){ - for(index_t j = 0; j < mat.size(1); ++j){ + for (index_t i = 0; i < mat.size(0); ++i) { + for (index_t j = 0; j < mat.size(1); ++j) { printf("%.2f ", mat[i][j]); } printf("\n"); } - FreeSpace(&mat); FreeSpace(&mat2); // shutdown tensor enigne after usage - //ShutdownTensorEngine(); + ShutdownTensorEngine(); return 0; } diff --git a/example/exp-template/Makefile b/guide/exp-template/Makefile similarity index 100% rename from example/exp-template/Makefile rename to guide/exp-template/Makefile diff --git a/example/exp-template/README.md b/guide/exp-template/README.md similarity index 100% rename from example/exp-template/README.md rename to guide/exp-template/README.md diff --git a/example/exp-template/exp_lazy.cpp b/guide/exp-template/exp_lazy.cpp similarity index 100% rename from example/exp-template/exp_lazy.cpp rename to guide/exp-template/exp_lazy.cpp diff --git a/example/exp-template/exp_template.cpp b/guide/exp-template/exp_template.cpp similarity index 100% rename from example/exp-template/exp_template.cpp rename to guide/exp-template/exp_template.cpp diff --git a/example/exp-template/exp_template_op.cpp b/guide/exp-template/exp_template_op.cpp similarity index 100% rename from example/exp-template/exp_template_op.cpp rename to guide/exp-template/exp_template_op.cpp diff --git a/example/neuralnet/Makefile b/guide/neuralnet/Makefile similarity index 100% rename from example/neuralnet/Makefile rename to guide/neuralnet/Makefile diff --git a/example/neuralnet/README.md b/guide/neuralnet/README.md similarity index 100% rename from example/neuralnet/README.md rename to 
guide/neuralnet/README.md diff --git a/example/neuralnet/convnet.cu b/guide/neuralnet/convnet.cu similarity index 100% rename from example/neuralnet/convnet.cu rename to guide/neuralnet/convnet.cu diff --git a/example/neuralnet/nnet.cu b/guide/neuralnet/nnet.cu similarity index 100% rename from example/neuralnet/nnet.cu rename to guide/neuralnet/nnet.cu diff --git a/example/neuralnet/run.sh b/guide/neuralnet/run.sh similarity index 100% rename from example/neuralnet/run.sh rename to guide/neuralnet/run.sh diff --git a/example/neuralnet/util.h b/guide/neuralnet/util.h similarity index 100% rename from example/neuralnet/util.h rename to guide/neuralnet/util.h diff --git a/make/README.md b/make/README.md new file mode 100644 index 000000000000..8fa257564b35 --- /dev/null +++ b/make/README.md @@ -0,0 +1,17 @@ +Makefile Configuration of MShadow +===== +This folder contains Makefile configuration of mshadow. MShadow is a template library, +you only need to include mshadow to use it. + +You can compile mshadow with different mode, for example, with or without CUDA. There are different compile flags +that you might need to set in your own configuration, and this folder provides a Makefile script to help you do that. + +Usage +===== +* Set the configurations via variables in your Makefile, see example in [../guide/config.mk](../guide/config.mk) +* include [mshadow.mk](mshadow.mk) in your Makefile +* mshadow.mk will give you compiler variables that you can include when compiling + - Add MSHADOW_CFLAGS to the compile flags + - Add MSHADOW_LDFLAGS to the linker flags + - Add MSHADOW_NVCCFLAGS to the nvcc compile flags +* For example Makefile, see [../guide/Makefile](../guide/Makefile) diff --git a/make/mshadow.mk b/make/mshadow.mk new file mode 100644 index 000000000000..f4419f6c9f70 --- /dev/null +++ b/make/mshadow.mk @@ -0,0 +1,59 @@ +#--------------------------------------------------------------------------------------- +# mshadow configuration script +# +# include mshadow.mk after the variables are set +# +# Add MSHADOW_CFLAGS to the compile flags +# Add MSHADOW_LDFLAGS to the linker flags +# Add MSHADOW_NVCCFLAGS to the nvcc compile flags +#---------------------------------------------------------------------------------------- + +MSHADOW_CFLAGS = -msse3 -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas +MSHADOW_LDFLAGS = -lm +MSHADOW_NVCCFLAGS = + +ifeq ($(USE_CUDA), 0) + MSHADOW_CFLAGS += -DMSHADOW_USE_CUDA=0 +else + MSHADOW_LDFLAGS += -lcudart -lcublas -lcurand +endif +ifneq ($(USE_CUDA_PATH), NONE) + MSHADOW_CFLAGS += -I$(USE_CUDA_PATH)/include + MSHADOW_LDFLAGS += -L$(USE_CUDA_PATH)/lib64 +endif + +ifeq ($(USE_BLAS), mkl) +ifneq ($(USE_INTEL_PATH), NONE) + MSHADOW_LDFLAGS += -L$(USE_INTEL_PATH)/mkl/lib/intel64 + MSHADOW_LDFLAGS += -L$(USE_INTEL_PATH)/lib/intel64 + MSHADOW_CFLAGS += -I$(USE_INTEL_PATH)/mkl/include +endif + MSHADOW_LDFLAGS += -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 +else + MSHADOW_CFLAGS += -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0 +endif +ifeq ($(USE_BLAS), openblas) + MSHADOW_LDFLAGS += -lopenblas +else ifeq ($(USE_BLAS), atlas) + MSHADOW_LDFLAGS += -lcblas +else ifeq ($(USE_BLAS), blas) + MSHADOW_LDFLAGS += -lblas +endif + +ifeq ($(PS_PATH), NONE) +PS_PATH = .. 
+endif +ifeq ($(PS_THIRD_PATH), NONE) +PS_THIRD_PATH = $(PS_PATH)/third_party +endif + +ifeq ($(USE_DIST_PS),1) +MSHADOW_CFLAGS += -DMSHADOW_DIST_PS=1 -std=c++11 \ + -I$(PS_PATH)/src -I$(PS_THIRD_PATH)/include +PS_LIB = $(addprefix $(PS_PATH)/build/, libps.a libpsmain.a) \ + $(addprefix $(PS_THIRD_PATH)/lib/, libgflags.a libzmq.a libprotobuf.a \ + libglog.a libz.a libsnappy.a) +MSHADOW_NVCCFLAGS += --std=c++11 +else + MSHADOW_CFLAGS+= -DMSHADOW_DIST_PS=0 +endif diff --git a/mshadow/base.h b/mshadow/base.h index ddd97a9d0407..6a10c1f18a0a 100644 --- a/mshadow/base.h +++ b/mshadow/base.h @@ -79,10 +79,6 @@ #define MSHADOW_OLD_CUDA 0 #endif -/*! \brief use single precition float */ -#ifndef MSHADOW_SINGLE_PRECISION - #define MSHADOW_SINGLE_PRECISION 1 -#endif /*! \brief whether use SSE */ #ifndef MSHADOW_USE_SSE #define MSHADOW_USE_SSE 1 From 2c56c9c844e13260dac6de0a16fa45849cac7216 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 25 Jan 2015 18:02:56 -0800 Subject: [PATCH 130/147] standalone doc --- README.md | 45 +++------- guide/README.md | 217 ++++++++++++++++++++++++++++++++++++++++++++++++ make/README.md | 9 +- 3 files changed, 233 insertions(+), 38 deletions(-) create mode 100644 guide/README.md diff --git a/README.md b/README.md index 8cc278707b64..8f1cf327994e 100644 --- a/README.md +++ b/README.md @@ -1,44 +1,21 @@ mshadow: Matrix Shadow ====== -Lightweight CPU/GPU Matrix/Tensor Template Library in C++/CUDA - -Creater: Bing Xu and Tianqi Chen - - -Documentation and Tutorial: https://github.com/tqchen/mshadow/wiki - -Description -===== -Most machine learning algorithms requires matrix,tensor operations frequently. For example, Eq.(1) is a common SGD update rule, where the weight can be a vector, matrix or 3D tensor. Eq.(2) is the backpropagtion rule: -``` -(1) weight = - eta * ( grad + lambda * weight ); -(2) gradin = dot( gradout, netweight.T() ); -``` - -These operations are not hard to implement, even in C++. The first one is elementwise operations, and can easily be written as -``` -for( int index = 0; index < weight.length; index ++ ){ - weight[index] = - eta * ( grad[index] + lambda * weight[index] ); -} -``` -Eq.(2) is matrix product, and we can use standard BLAS packages such as Intel MKL. It will looklike -``` -sgemm( CblasNoTrans, CblasTrans, n, m, k, 1.0, gradout.ptr, lda, netweight.ptr, ldb, 0.0, gradin.ptr, ldc ); -``` -However: - -* It is annoying to write these codes repeatively, and they are not intuitive. -* What if we want to port our code to GPU? We need to rewrite our code in CUDA - -mshadow is a unified C++/CUDA lib to to write Eq.(1) and Eq.(2) in C++, and *translate* them to the for loop and standard packages such as MKL, CuBLAS *in compile time*. +MShadow is a lightweight CPU/GPU Matrix/Tensor Template Library in C++/CUDA. The goal of mshadow is to support ***efficient***, +***device invariant*** and ***simple*** tensor library for machine learning project that aims for both simplicity and performance. 
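To make the "simple and efficient" claim concrete before the links below, here is a minimal sketch in the spirit of the SGD example developed in the tutorial later in this series; `Tensor<xpu, 2>` and the overloaded expression operators are mshadow's, while the function itself is just an illustration:

```c++
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

// one definition, two devices: instantiate with xpu = cpu or xpu = gpu
template<typename xpu>
void UpdateSGD(Tensor<xpu, 2> weight, const Tensor<xpu, 2> &grad,
               float eta, float lambda) {
  // lazily evaluated; compiles into a single loop (or CUDA kernel)
  // with no temporary allocation
  weight -= eta * (grad + lambda * weight);
}
```

The guide linked below walks through how an expression like this compiles into a single loop with no temporaries.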
+* [Contributors](https://github.com/tqchen/mshadow/graphs/contributors) +* [Tutorial](guide) Features ===== -* Shadow instead of giant: mshadow does not implement all of the functions, it is more of a wrapper to translated easy-to-read code to standard 'giant' packages such as MKL -* Whitebox instead of blackbox: put a float* into the Tensor struct and take the benefit of the package, no memory allocation is happened unless explicitly called -* Unified CPU/GPU code: write a code and it should run in both CPU and GPU +* Efficient: all the expression you write will be lazily evaluated and compiled into optimized code in ***compile time***. + - No temporal memory allocation will happen for expression you write + - mshadow will generate specific kernel for every expression you write in compile time. +* Device invariant: you can write +* Simple: mshadow allows you to write machine learning code using expressions. + - For example: ```weight = -eta * (grad + lambda * weight); ```, ```gradin = dot(gradout, netweight.T());``` +* Whitebox: put a float* into the Tensor struct and take the benefit of the package, no memory allocation is happened unless explicitly called * Lightweight library: light amount of code to support frequently used functions in machine learning * Extendable: user can write simple functions that plugs into mshadow and run on GPU/CPU, no experience in CUDA is required. diff --git a/guide/README.md b/guide/README.md new file mode 100644 index 000000000000..412182955b06 --- /dev/null +++ b/guide/README.md @@ -0,0 +1,217 @@ +Tutorial of mshadow +===== +This is a beginner's tutorial of mshadow. If you like mshadow and have ideas to improve this tutorial, you are more than welcomed:) +Please send a pull-request if you would like to share your experience, + +**List of Topics** +* [Tensor Data Structure](#tensor-data-structure) +* [Memory Allocation](#memory-allocation) +* [Elementwise Operations](#elementwise-operations) +* [One code for both CPU and GPU](#one-code-for-both-cpu-and-gpu) +* [Matrix Multiplications](#matrix-multiplications) +* [User Defined Operator](#user-defined-operator) + +Tensor Data Structure +==== +The basic data structure of mshadow is Tensor. The following is a simplified equivalent version of +the declaration in [mashadow/tensor.h](../mshadow/tensor.h) +```c++ +typedef unsigned index_t; +template +struct Shape { + index_t shape_[dimension]; +}; +template +struct Tensor { + DType *dptr_; + Shape shape_; + index_t stride_; +}; +// this is how shape object declaration look like +Shape<2> shape2; +// this is how tensor object declaration look like +// you can +Tensor ts2; +Tensor ts3; +``` +``` Tensor``` means a two dimensional tensor in CPU, while ``` Tensor``` means three dimensional tensor in GPU. +```Shape``` gives the shape information of k-dimensional tensor. The declaration use template, and +can be specialized into tensor of specific device and dimension. This is what two dimensional tensor will look like: +```c++ +struct Shape<2> { + index_t shape_[2]; +}; +struct Tensor { + float *dptr_; + Shape<2> shape_; + index_t stride_; +}; +``` +* ``` Tensor``` contains ```dptr_```, which points to the space that backup the tensor. +* ```Shape<2>``` is a structure that stores shape information, the convention is same as numpy +* ```stride_``` gives the number of cell space allocated in the smallest dimension (if we use numpy convention, the dimension corresponds to shape_[-1]). 
+ This is introduced when we introduce some padding cells in lowest dimension to make sure memory is aligned. + - ```stride_``` is automatically set during memory allocation of tensor in mshadow. + +To understand the data structure, consider the following code: +``` c++ +float data[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; +Tensor ts; +ts.dptr_ = data; +ts.shape_ = mshadow::Shape2(3, 2); +ts.stride_ = 3; +// now: ts[0][0] == 0, ts[0][1] == 1 , ts[1][0] == 3, ts[1][1] == 4 +for (index_t i = 0; i < ts.size(0); ++i) { + for (index_t j = 0; j < ts.size(1), ++j) { + printf("ts[%u][%u]=%f\n", i, j, ts[i][j]); + } +} +``` +The result ts should be a 3 * 2 matrix, where data[2], data[5], data[8] are padding cells that are ignored. If you want a continuous memory, set ```stride_=shape_[1]```. + +Memory Allocation +==== +An important design choice about mshadow is that the data structure is a **whitebox**: +it works so long as we set the space pointer ```dptr_```, corresponding ```shape_``` and ```stride_```: +* For ```Tensor```, the space can be created by ```new float[]```, or pointer to some existing space such as float array in last example. +* For ```Tensor```, the space need to lie in GPU, created by ```cudaMallocPitch``` + +mshadow also provide explicit memory allocation routine, demonstrated shown by following code +``` c++ +// create a 5 x 3 tensor on GPU, and allocate space +Tensor ts2(Shape2(5, 3)); +AllocSpace(&ts2); +// allocate 5 x 3 x 2 tensor on CPU, initialized by 0 +Tensor ts3 = NewTensor(Shape3(5,3,2), 0.0f); +// free space +FreeSpace(&ts2); FreeSpace(&ts3); +``` +All memory allocations in mshadow are **explicit**. There is **no** implicit memory allocation and de-allocation during any operations. +This means ```Tensor``` variable is more like a reference handle(pointer), instead of a object. If we assign a tensor to another variable, the two share the same content space. + +This also allows user to use mshadow in their existing project easily, simply give mshadow the pointer of the memory and you can get the benefit of all the mshadow expressions with zero cost:) + +Elementwise Operations +==== +All the operators(+, -, *, /, += etc.) in mshadow are element-wise. Consider the following SGD update code: +```c++ +void UpdateSGD(Tensor weight, Tensor grad, float eta, float lambda) { + weight -= eta * (grad + lambda * weight); +} +``` +During compilation, this code will be translated to the following form: +```c++ +void UpdateSGD(Tensor weight, Tensor grad, float eta, float lambda) { + for (index_t y = 0; y < weight.size(0); ++y) { + for (index_t x = 0; x < weight.size(1); ++x) { + weight[y][x] -= eta * (grad[y][x] + lambda * weight[y][x]); + } + } +} +``` +As we can see, *no memory allocation* is happened in the translated code. For ```Tensor```, the corresponding function will be translated into a CUDA kernel of same spirit. +Using [Expression Template](exp-template), the translation is happened during compile time. We can write simple lines of code while get the full performance of the translated code. + +One code for both CPU and GPU +==== +Since mshadow have identical interface for ```Tensor``` and ```Tensor```, we can easily write one code that works in both CPU and GPU. +For example, the following code compiles for both GPU and CPU Tensors. +```c++ +template +void UpdateSGD(Tensor weight, const Tensor &grad, + float eta, float lambda) { + weight -= eta * (grad + lambda * weight); +} +``` +Matrix Multiplications +==== +We also have short hands for dot product, as like follows. 
The code will be translated to call standard packages such as MKL and CuBLAS. +```c++ +template +void Backprop(Tensor gradin, + const Tensor &gradout, + const Tensor &netweight) { + gradin = dot(gradout, netweight.T()); +} +``` +Again, the code can compile for both GPU and CPU Tensors + +User Defined Operator +==== +There are common cases when we want to define our own function. For example, assume we do not have element-wise sigmoid transformation in mshadow, +which is very commonly used in machine learning algorithms. We simply use the following code to add sigmoid to mshadow +```c++ +struct sigmoid { + MSHADOW_XINLINE static float Map(float a) { + return 1.0f / (1.0f + expf(-a)); + } +}; +template +void ExampleSigmoid(Tensor out, const Tensor &in) { + out = F(in * 2.0f) + 1.0f; +} +``` +The equivalent translated code for CPU is given by +```c++ +template +void ExampleSigmoid(Tensor out, const Tensor &in) { + for (index_t y = 0; y < out.size(0); ++y) { + for(index_t x = 0; x < out.size(1); ++x) { + out[y][x] = sigmoid::Map(in[y][x] * 2.0f) + 1.0f; + } + } +} +``` +Also note that the defined operation can be **composited into expressions**, not only we can write ```out = F(in)```, +we can also write ```out = F+2.0``` or ```out = F(F(in))```. + +There will also be a translated CUDA kernel version that runs in GPU. Check out [defop.cpp](defop.cpp) for complete example. + +Complete Example +==== +The following code is from [basic.cpp](basic.cpp), that illustrate basic usage of mshadow. + +```c++ +// header file to use mshadow +#include "mshadow/tensor.h" +// this namespace contains all data structures, functions +using namespace mshadow; +// this namespace contains all operator overloads +using namespace mshadow::expr; + +int main(void) { + // intialize tensor engine before using tensor operation, needed for CuBLAS + InitTensorEngine(); + // assume we have a float space + float data[20]; + // create a 2 x 5 x 2 tensor, from existing space + Tensor ts(data, Shape3(2,5,2)); + // take first subscript of the tensor + Tensor mat = ts[0]; + // Tensor object is only a handle, assignment means they have same data content + // we can specify content type of a Tensor, if not specified, it is float bydefault + Tensor mat2 = mat; + + // shaape of matrix, note size order is same as numpy + printf("%u X %u matrix\n", mat.size(1), mat.size(1)); + + // initialize all element to zero + mat = 0.0f; + // assign some values + mat[0][1] = 1.0f; mat[1][0] = 2.0f; + // elementwise operations + mat += (mat + 10.0f) / 10.0f + 2.0f; + + // print out matrix, note: mat2 and mat1 are handles(pointers) + for (index_t i = 0; i < mat.size(0); ++i) { + for (index_t j = 0; j < mat.size(1); ++j) { + printf("%.2f ", mat2[i][j]); + } + printf("\n"); + } + // shutdown tensor enigne after usage + ShutdownTensorEngine(); + return 0; +} +``` + diff --git a/make/README.md b/make/README.md index 8fa257564b35..6ef24d6d467c 100644 --- a/make/README.md +++ b/make/README.md @@ -1,10 +1,11 @@ Makefile Configuration of MShadow ===== -This folder contains Makefile configuration of mshadow. MShadow is a template library, -you only need to include mshadow to use it. +MShadow is a template library, you only need to include mshadow to use it. So this folder is not used to build mshadow library file. -You can compile mshadow with different mode, for example, with or without CUDA. There are different compile flags -that you might need to set in your own configuration, and this folder provides a Makefile script to help you do that. 
+However, mshadow is a flexible library that allows you to compile with different configurations. For example, +you can compile mshadow without CUDA, and specify your own choice of BLAS. +There are different compile flags that you might need to set in your own configuration. +This folder provides a Makefile script to help you do that. Usage ===== From a8864f8072999240e436c44737940ef7bfdbcfdc Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 25 Jan 2015 18:04:29 -0800 Subject: [PATCH 131/147] slight change --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8f1cf327994e..88d54240b635 100644 --- a/README.md +++ b/README.md @@ -9,12 +9,11 @@ MShadow is a lightweight CPU/GPU Matrix/Tensor Template Library in C++/CUDA. The Features ===== -* Efficient: all the expression you write will be lazily evaluated and compiled into optimized code in ***compile time***. +* Efficient: all the expression you write will be lazily evaluated and compiled into optimized code - No temporal memory allocation will happen for expression you write - mshadow will generate specific kernel for every expression you write in compile time. -* Device invariant: you can write +* Device invariant: you can write one code and it will run on both CPU and GPU * Simple: mshadow allows you to write machine learning code using expressions. - - For example: ```weight = -eta * (grad + lambda * weight); ```, ```gradin = dot(gradout, netweight.T());``` * Whitebox: put a float* into the Tensor struct and take the benefit of the package, no memory allocation is happened unless explicitly called * Lightweight library: light amount of code to support frequently used functions in machine learning * Extendable: user can write simple functions that plugs into mshadow and run on GPU/CPU, no experience in CUDA is required. From fb4549b34645c36ab2cf48dce02190b2b0926223 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 25 Jan 2015 18:29:03 -0800 Subject: [PATCH 132/147] add guide of exp template --- guide/README.md | 5 +- guide/exp-template/.gitignore | 1 + guide/exp-template/README.md | 342 ++++++++++++++++++++++++- guide/exp-template/exp_lazy.cpp | 59 +++-- guide/exp-template/exp_template.cpp | 98 +++---- guide/exp-template/exp_template_op.cpp | 114 +++++---- 6 files changed, 490 insertions(+), 129 deletions(-) create mode 100644 guide/exp-template/.gitignore diff --git a/guide/README.md b/guide/README.md index 412182955b06..5f539eb66da8 100644 --- a/guide/README.md +++ b/guide/README.md @@ -1,7 +1,10 @@ Tutorial of mshadow ===== This is a beginner's tutorial of mshadow. If you like mshadow and have ideas to improve this tutorial, you are more than welcomed:) -Please send a pull-request if you would like to share your experience, +Please send a pull-request if you would like to share your experience. + + +See also [Expression Template Tutorial][exp-template] **List of Topics** * [Tensor Data Structure](#tensor-data-structure) diff --git a/guide/exp-template/.gitignore b/guide/exp-template/.gitignore new file mode 100644 index 000000000000..fc070ad5bd7e --- /dev/null +++ b/guide/exp-template/.gitignore @@ -0,0 +1 @@ +exp_* \ No newline at end of file diff --git a/guide/exp-template/README.md b/guide/exp-template/README.md index 8c30a2998c2a..f5b2668ab2fc 100644 --- a/guide/exp-template/README.md +++ b/guide/exp-template/README.md @@ -1,4 +1,340 @@ -This folder is not example of mshadow code. -It is example code introducing expression template, the trick behind mshadow. 
Expression Template Tutorial
====
This page explains how mshadow works. The main trick behind mshadow is called [Expression Template](http://en.wikipedia.org/wiki/Expression_templates).
We will explain how it affects the performance of the compiled code. Expression templates are the major trick behind C++ matrix libraries such as Eigen, GSL, and boost.uBLAS.

How to write an efficient update rule for machine learning
====
Before we start, let us think about the question above. Assume we want to write down the update rule
```c++
weight = - eta * (grad + lambda * weight);
```
where weight and grad are vectors of length ```n```. When you choose C++ as your programming language,
the major concern is usually efficiency. There is one principle that is important and used in most C/C++ programs:
* Pre-allocate necessary memory, with **no temporary memory allocation** during execution.

An example implementation looks like
```c++
void UpdateWeight(const float *grad, float eta, float lambda,
                  int n, float *weight) {
  for (int i = 0; i < n; ++i) {
    weight[i] = - eta * (grad[i] + lambda * weight[i]);
  }
}
```
The function takes the pre-allocated space of grad and weight, and runs the calculation. Writing such functions is simple;
however, it gets annoying when we write them repeatedly. So the question is: can we write the following, and get the same performance as the previous code?
```c++
void UpdateWeight(const Vec& grad, float eta, float lambda, Vec& weight) {
  weight = -eta * (grad + lambda * weight);
}
```
The answer is yes, but not via the most obvious solution.

A Naive Bad Solution
====
Let us first take a look at the most straightforward solution: operator overloading.
```c++
// Naive solution for vector operation overloading
#include <cstring>

struct Vec {
  int len;
  float* dptr;
  Vec(int len) : len(len) {
    dptr = new float[len];
  }
  Vec(const Vec& src) : len(src.len) {
    dptr = new float[len];
    memcpy(dptr, src.dptr, sizeof(float) * len);
  }
  ~Vec(void) {
    delete [] dptr;
  }
};

inline Vec operator+(const Vec &lhs, const Vec &rhs) {
  Vec res(lhs.len);
  for (int i = 0; i < lhs.len; ++i) {
    res.dptr[i] = lhs.dptr[i] + rhs.dptr[i];
  }
  return res;
}
```
If we add more operator overloads in the same style, we can get what we want and write equations instead of loops.
However, this kind of approach is inefficient, because temporary memory is allocated and de-allocated during each operation, when we could have done better.

An alternative, more effective way is to only overload operator+= and operator-=, which can be implemented without temporary memory allocation. But this limits the equations we can write.

At the end of this tutorial we will discuss why we still need expression templates even though C++11 provides move assignment operators and rvalue references.

Lazy Evaluation
====
Let us think about why we need temporary memory allocation in operator+. It is because we *do not know* the target that will be assigned to in operator+;
otherwise we could store directly into the target memory instead of a temporary.

What if we could know the target? The following code ([exp_lazy.cpp](exp_lazy.cpp)) achieves this.
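For contrast, it is worth spelling out what the naive operator+ from the previous section pays for a single `A = B + C`. This is a sketch of the steps, assuming the naive `Vec` above, not literal compiler output:

```c++
// Cost of evaluating B + C under the naive overloads above (a sketch):
//   Vec res(lhs.len);              // heap allocation for a temporary
//   for (...) res.dptr[i] = ...;   // the actual work
//   return res;                    // copy construction: new[] + memcpy
//                                  // (unless the compiler elides it)
//   ...                            // finally ~Vec(): delete [] dptr
// Every assignment pays at least one allocation and one free beyond the
// loop itself; the lazy version below pays only the loop.
```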
```c++
// Example Lazy evaluation code
// for simplicity, we use struct and make all members public
#include <cstdio>
struct Vec;
// expression structure holds the expression
struct BinaryAddExp {
  const Vec &lhs;
  const Vec &rhs;
  BinaryAddExp(const Vec &lhs, const Vec &rhs)
      : lhs(lhs), rhs(rhs) {}
};
// no constructor and destructor to allocate and de-allocate memory,
// allocation done by user
struct Vec {
  int len;
  float* dptr;
  Vec(void) {}
  Vec(float *dptr, int len)
      : len(len), dptr(dptr) {}
  // here is where evaluation happens
  inline Vec &operator=(const BinaryAddExp &src) {
    for (int i = 0; i < len; ++i) {
      dptr[i] = src.lhs.dptr[i] + src.rhs.dptr[i];
    }
    return *this;
  }
};
// no evaluation happens here
inline BinaryAddExp operator+(const Vec &lhs, const Vec &rhs) {
  return BinaryAddExp(lhs, rhs);
}

const int n = 3;
int main(void) {
  float sa[n] = {1, 2, 3};
  float sb[n] = {2, 3, 4};
  float sc[n] = {3, 4, 5};
  Vec A(sa, n), B(sb, n), C(sc, n);
  // run expression
  A = B + C;
  for (int i = 0; i < n; ++i) {
    printf("%d:%f==%f+%f\n", i, A.dptr[i], B.dptr[i], C.dptr[i]);
  }
  return 0;
}
```
The idea is that we do not actually do any computation in operator+; we only return an expression structure (like an abstract syntax tree),
and when we overload operator=, we see the target as well as all the operands, so we can run the computation without introducing any extra memory!
Similarly, we can define a DotExp that is lazily evaluated in operator=, redirecting matrix (vector) multiplications to BLAS.

More Lengthy Expressions and Expression Template
====
By using lazy evaluation, we avoid temporary memory allocations. But the ability of the code is limited:
* We can only write ```A=B+C```, but not longer expressions.
* When we add more expressions, we need to write more operator= overloads to evaluate each of them.

Here is where the magic of template programming comes to the rescue. The following code ([exp_template.cpp](exp_template.cpp)),
which is a bit more lengthy, allows you to write longer equations.
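Before diving into the listing, it may help to see its central trick, the Curiously Recurring Template Pattern, in isolation. This is a minimal, self-contained sketch; ```Exp``` and ```self()``` mirror the names used in the listing, while ```value``` is just a placeholder member:

```c++
#include <cstdio>

// CRTP in isolation: the base class is templated on its own derived type,
// so self() can recover the concrete type without virtual dispatch
template<typename SubType>
struct Exp {
  inline const SubType& self(void) const {
    return *static_cast<const SubType*>(this);
  }
};

struct Vec : public Exp<Vec> {
  float value;
};

int main(void) {
  Vec v; v.value = 2.0f;
  const Exp<Vec> &e = v;           // only the base type is visible here
  printf("%f\n", e.self().value);  // ...yet we get the derived Vec back
  return 0;
}
```

With that pattern in hand, here is the full listing.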
```c++
// Example code, expression template, and more lengthy equations
// for simplicity, we use struct and make all members public
#include <cstdio>

// this is expression, all expressions must inherit it,
// and put their type in subtype
template<typename SubType>
struct Exp {
  // returns const reference of the actual type of this expression
  inline const SubType& self(void) const {
    return *static_cast<const SubType*>(this);
  }
};

// binary add expression
// note how it inherits from Exp
// and puts its own type into the template argument
template<typename TLhs, typename TRhs>
struct BinaryAddExp: public Exp<BinaryAddExp<TLhs, TRhs> > {
  const TLhs &lhs;
  const TRhs &rhs;
  BinaryAddExp(const TLhs& lhs, const TRhs& rhs)
      : lhs(lhs), rhs(rhs) {}
  // evaluation function, evaluate this expression at position i
  inline float Eval(int i) const {
    return lhs.Eval(i) + rhs.Eval(i);
  }
};
// no constructor and destructor to allocate
// and de-allocate memory, allocation done by user
struct Vec: public Exp<Vec> {
  int len;
  float* dptr;
  Vec(void) {}
  Vec(float *dptr, int len)
      : len(len), dptr(dptr) {}
  // here is where evaluation happens
  template<typename EType>
  inline Vec& operator=(const Exp<EType>& src_) {
    const EType &src = src_.self();
    for (int i = 0; i < len; ++i) {
      dptr[i] = src.Eval(i);
    }
    return *this;
  }
  // evaluation function, evaluate this expression at position i
  inline float Eval(int i) const {
    return dptr[i];
  }
};
// template add, works for any expressions
template<typename TLhs, typename TRhs>
inline BinaryAddExp<TLhs, TRhs>
operator+(const Exp<TLhs> &lhs, const Exp<TRhs> &rhs) {
  return BinaryAddExp<TLhs, TRhs>(lhs.self(), rhs.self());
}

const int n = 3;
int main(void) {
  float sa[n] = {1, 2, 3};
  float sb[n] = {2, 3, 4};
  float sc[n] = {3, 4, 5};
  Vec A(sa, n), B(sb, n), C(sc, n);
  // run expression, this expression is longer:)
  A = B + C + C;
  for (int i = 0; i < n; ++i) {
    printf("%d:%f == %f + %f + %f\n", i,
           A.dptr[i], B.dptr[i],
           C.dptr[i], C.dptr[i]);
  }
  return 0;
}
```
The key idea of the code is that the template ```Exp``` takes the type of its derived class as a template argument, so it can convert itself to
that SubType via ```self()```. BinaryAddExp is now a template class that can compose expressions together, like a template version of the Composite pattern.
Evaluation is done through the function Eval, which is computed recursively in BinaryAddExp.
* Due to inlining, the calls to ```src.Eval(i)``` in ```operator=``` compile down to ```B.dptr[i] + C.dptr[i] + C.dptr[i]``` at compile time.
* We can write equations for element-wise operations with the same efficiency as if we had written the loop by hand.

Make it more flexible
====
As we can see from the previous example, template programming is a powerful way to make things flexible at compile time. Our final example,
which is closer to mshadow, allows user-customized binary operators ([exp_template_op.cpp](exp_template_op.cpp)).
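As an aside, it is worth tracing what the compiler instantiates for the ```A = B + C + C``` example above. This is a sketch of the expansion with the types written out, not literal generated code:

```c++
// How A = B + C + C unfolds at compile time (a sketch):
// B + C builds one expression node; adding C nests another around it:
//   BinaryAddExp<Vec, Vec>                     e1(B, C);
//   BinaryAddExp<BinaryAddExp<Vec, Vec>, Vec>  e2(e1, C);
// operator= then runs a single loop, and Eval inlines recursively:
//   for (int i = 0; i < A.len; ++i) {
//     A.dptr[i] = e2.Eval(i);  // == B.dptr[i] + C.dptr[i] + C.dptr[i]
//   }
```

The listing below applies the same recipe, with the operator itself supplied as a template parameter.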
```c++
// Example code, expression template
// with binary operator definition and extension
// for simplicity, we use struct and make all members public
#include <cstdio>

// this is expression, all expressions must inherit it,
// and put their type in subtype
template<typename SubType>
struct Exp {
  // returns const reference of the actual type of this expression
  inline const SubType& self(void) const {
    return *static_cast<const SubType*>(this);
  }
};

// binary operators
struct mul {
  inline static float Map(float a, float b) {
    return a * b;
  }
};

// binary map expression
// note how it inherits from Exp
// and puts its own type into the template argument
template<typename OP, typename TLhs, typename TRhs>
struct BinaryMapExp: public Exp<BinaryMapExp<OP, TLhs, TRhs> > {
  const TLhs &lhs;
  const TRhs &rhs;
  BinaryMapExp(const TLhs& lhs, const TRhs& rhs)
      : lhs(lhs), rhs(rhs) {}
  // evaluation function, evaluate this expression at position i
  inline float Eval(int i) const {
    return OP::Map(lhs.Eval(i), rhs.Eval(i));
  }
};
// no constructor and destructor to allocate and de-allocate memory
// allocation done by user
struct Vec: public Exp<Vec> {
  int len;
  float* dptr;
  Vec(void) {}
  Vec(float *dptr, int len)
      : len(len), dptr(dptr) {}
  // here is where evaluation happens
  template<typename EType>
  inline Vec& operator=(const Exp<EType>& src_) {
    const EType &src = src_.self();
    for (int i = 0; i < len; ++i) {
      dptr[i] = src.Eval(i);
    }
    return *this;
  }
  // evaluation function, evaluate this expression at position i
  inline float Eval(int i) const {
    return dptr[i];
  }
};
// template binary map, works for any expressions and operators
template<typename OP, typename TLhs, typename TRhs>
inline BinaryMapExp<OP, TLhs, TRhs>
F(const Exp<TLhs>& lhs, const Exp<TRhs>& rhs) {
  return BinaryMapExp<OP, TLhs, TRhs>(lhs.self(), rhs.self());
}

template<typename TLhs, typename TRhs>
inline BinaryMapExp<mul, TLhs, TRhs>
operator*(const Exp<TLhs>& lhs, const Exp<TRhs>& rhs) {
  return F<mul>(lhs, rhs);
}

// user defined operation
struct maximum {
  inline static float Map(float a, float b) {
    return a > b ? a : b;
  }
};

const int n = 3;
int main(void) {
  float sa[n] = {1, 2, 3};
  float sb[n] = {2, 3, 4};
  float sc[n] = {3, 4, 5};
  Vec A(sa, n), B(sb, n), C(sc, n);
  // run expression, this expression is longer:)
  A = B * F<maximum>(C, B);
  for (int i = 0; i < n; ++i) {
    printf("%d:%f == %f * max(%f, %f)\n",
           i, A.dptr[i], B.dptr[i], C.dptr[i], B.dptr[i]);
  }
  return 0;
}
```

Summary
=====
Up to this point, you should understand the basic ideas behind how it works:
* Lazy evaluation, to allow us to see all the operands and the target.
* Template composition and recursive evaluation, to allow us to evaluate arbitrary composite expressions for element-wise operations.
* Due to templates and inlining, writing expressions is as efficient as directly writing a for loop to implement the update rule:)

So write expressions when you write machine learning code, and focus your energy on the algorithm part that matters.

The Expression Template in MShadow
=====
The expression template in mshadow uses the same key ideas introduced in this tutorial, with some minor differences:
* We separate the evaluation code from the expression construction and composition code.
  - Instead of putting Eval in the Exp class, a Plan class is created from the expression and used to evaluate the result.
  - This allows us to put fewer variables in Plan; for example, we do not need the array length when we evaluate the data.
  - One important reason is that CUDA kernels cannot take classes with const references.
  - This design choice is debatable, but we find it useful so far.
+* Lazy support for complex expressions such as matrix dot product + - Besides element-wise expressions, we also want to support sugars such as ```A = dot(B.T(), C)```, again, lazy evaluation is used and no extra memory is allocated. +* Type checking and array length checking. + +Notes +==== +* Expression Template and C++11: in C++11, move constructor can be used to save repetitive allocation memory, which removes some need to expression template. However, the space still needs to be allocated at least once. + - This only removes the need of expression template then expression generate space, say dst = A+B+C, dst does not contain space allocated before assignment. + - If we want to keep the syntax that everything is pre-allocated, and expression executes without memory allocation (which is what we did in mshadow), we still need expression template. -See: https://github.com/tqchen/mshadow/wiki/Expression-Template diff --git a/guide/exp-template/exp_lazy.cpp b/guide/exp-template/exp_lazy.cpp index 91f49b4fca78..4e6a6b14b9de 100644 --- a/guide/exp-template/exp_lazy.cpp +++ b/guide/exp-template/exp_lazy.cpp @@ -3,38 +3,43 @@ #include struct Vec; // expression structure holds the expression -struct BinaryAddExp{ - const Vec& lhs; - const Vec& rhs; - BinaryAddExp(const Vec& lhs, const Vec& rhs):lhs(lhs),rhs(rhs){} +struct BinaryAddExp { + const Vec &lhs; + const Vec &rhs; + BinaryAddExp(const Vec &lhs, const Vec &rhs) + : lhs(lhs), rhs(rhs) {} }; -// no constructor and destructor to allocate and de-allocate memory, allocation done by user +// no constructor and destructor to allocate and de-allocate memory, +// allocation done by user struct Vec { - int len; - float* dptr; - Vec (void){} - Vec (float *dptr, int len):len(len),dptr(dptr){} - // here is where evaluation happens - inline Vec& operator= (const BinaryAddExp& src){ - for( int i = 0; i < len; ++i ){ - dptr[i] = src.lhs.dptr[i] + src.rhs.dptr[i]; - } - return *this; + int len; + float* dptr; + Vec(void) {} + Vec(float *dptr, int len) + : len(len), dptr(dptr) {} + // here is where evaluation happens + inline Vec &operator=(const BinaryAddExp &src) { + for (int i = 0; i < len; ++i) { + dptr[i] = src.lhs.dptr[i] + src.rhs.dptr[i]; } + return *this; + } }; // no evaluation happens here -inline BinaryAddExp operator+ (const Vec& lhs, const Vec& rhs){ - return BinaryAddExp(lhs, rhs); +inline BinaryAddExp operator+(const Vec &lhs, const Vec &rhs) { + return BinaryAddExp(lhs, rhs); } -const int n = 3; -int main( void ){ - float sa[n]={1,2,3},sb[n]={2,3,4},sc[n]={3,4,5}; - Vec A(sa,n), B(sb,n), C(sc,n); - // run expression - A = B + C; - for( int i = 0; i < n; ++ i ){ - printf("%d:%f==%f+%f\n", i, A.dptr[i], B.dptr[i], C.dptr[i] ); - } - return 0; +const int n = 3; +int main(void) { + float sa[n] = {1, 2, 3}; + float sb[n] = {2, 3, 4}; + float sc[n] = {3, 4, 5}; + Vec A(sa, n), B(sb, n), C(sc, n); + // run expression + A = B + C; + for (int i = 0; i < n; ++i) { + printf("%d:%f==%f+%f\n", i, A.dptr[i], B.dptr[i], C.dptr[i]); + } + return 0; } diff --git a/guide/exp-template/exp_template.cpp b/guide/exp-template/exp_template.cpp index d9ec4622f706..556b10316a3b 100644 --- a/guide/exp-template/exp_template.cpp +++ b/guide/exp-template/exp_template.cpp @@ -1,64 +1,72 @@ // Example code, expression template, and more length equations // for simplicity, we use struct and make all members public - #include -// this is expression, all expressions must inheritate it, and put their type in subtype +// this is expression, all expressions must inheritate it, 
+// and put their type in subtype
 template<typename SubType>
-struct Exp{
-    // returns const reference of the actual type of this expression
-    inline const SubType& self(void) const{
-        return *static_cast<const SubType*>(this);
-    }
+struct Exp {
+  // returns const reference of the actual type of this expression
+  inline const SubType& self(void) const {
+    return *static_cast<const SubType*>(this);
+  }
 };
 // binary add expression
 // note how it inherits from Exp
 // and put its own type into the template argument
 template<typename TLhs, typename TRhs>
-struct BinaryAddExp: public Exp< BinaryAddExp<TLhs, TRhs> >{
-    const TLhs& lhs;
-    const TRhs& rhs;
-    BinaryAddExp(const TLhs& lhs, const TRhs& rhs):lhs(lhs),rhs(rhs){}
-    // evaluation function, evaluate this expression at position i
-    inline float Eval( int i ) const{
-        return lhs.Eval(i) + rhs.Eval(i);
-    }
+struct BinaryAddExp: public Exp<BinaryAddExp<TLhs, TRhs> > {
+  const TLhs &lhs;
+  const TRhs &rhs;
+  BinaryAddExp(const TLhs& lhs, const TRhs& rhs)
+      : lhs(lhs), rhs(rhs) {}
+  // evaluation function, evaluate this expression at position i
+  inline float Eval(int i) const {
+    return lhs.Eval(i) + rhs.Eval(i);
+  }
 };
-// no constructor and destructor to allocate and de-allocate memory, allocation done by user
-struct Vec: public Exp<Vec>{
-    int len;
-    float* dptr;
-    Vec (void){}
-    Vec (float *dptr, int len):len(len),dptr(dptr){}
-    // here is where evaluation happens
-    template<typename EType>
-    inline Vec& operator= (const Exp<EType>& src_){
-        const EType &src = src_.self();
-        for( int i=0; i < len; ++i ){
-            dptr[i] = src.Eval(i);
-        }
-        return *this;
-    }
-    // evaluation function, evaluate this expression at position i
-    inline float Eval( int i ) const{
-        return dptr[i];
+// no constructor and destructor to allocate
+// and de-allocate memory, allocation done by user
+struct Vec: public Exp<Vec> {
+  int len;
+  float* dptr;
+  Vec(void) {}
+  Vec(float *dptr, int len)
+      :len(len), dptr(dptr) {}
+  // here is where evaluation happens
+  template<typename EType>
+  inline Vec& operator= (const Exp<EType>& src_) {
+    const EType &src = src_.self();
+    for (int i = 0; i < len; ++i) {
+      dptr[i] = src.Eval(i);
     }
+    return *this;
+  }
+  // evaluation function, evaluate this expression at position i
+  inline float Eval(int i) const {
+    return dptr[i];
+  }
 };
 // template add, works for any expressions
 template<typename TLhs, typename TRhs>
-inline BinaryAddExp<TLhs, TRhs> operator+ (const Exp<TLhs>& lhs, const Exp<TRhs>& rhs){
-    return BinaryAddExp<TLhs, TRhs>(lhs.self(), rhs.self());
+inline BinaryAddExp<TLhs, TRhs>
+operator+(const Exp<TLhs> &lhs, const Exp<TRhs> &rhs) {
+  return BinaryAddExp<TLhs, TRhs>(lhs.self(), rhs.self());
 }
-const int n = 3;
-int main( void ){
-    float sa[n]={1,2,3},sb[n]={2,3,4},sc[n]={3,4,5};
-    Vec A(sa,n), B(sb,n), C(sc,n);
-    // run expression, this expression is longer:)
-    A = B + C + C;
-    for( int i = 0; i < n; ++ i ){
-        printf("%d:%f==%f+%f+%f\n", i, A.dptr[i], B.dptr[i], C.dptr[i], C.dptr[i] );
-    }
-    return 0;
+const int n = 3;
+int main(void) {
+  float sa[n] = {1, 2, 3};
+  float sb[n] = {2, 3, 4};
+  float sc[n] = {3, 4, 5};
+  Vec A(sa, n), B(sb, n), C(sc, n);
+  // run expression, this expression is longer:)
+  A = B + C + C;
+  for (int i = 0; i < n; ++i) {
+    printf("%d:%f == %f + %f + %f\n", i,
+           A.dptr[i], B.dptr[i],
+           C.dptr[i], C.dptr[i]);
+  }
+  return 0;
 }
diff --git a/guide/exp-template/exp_template_op.cpp b/guide/exp-template/exp_template_op.cpp
index 4399936b6981..249b181ada5b 100644
--- a/guide/exp-template/exp_template_op.cpp
+++ b/guide/exp-template/exp_template_op.cpp
@@ -1,84 +1,92 @@
 // Example code, expression template
 // with binary operator definition and extension
 // for simplicity, we use struct and make all members public
-
 #include <cstdio>
-// this is expression, all expressions must inherit it, and put their type in subtype
+// this is expression, all expressions must inherit it,
+// and put their type in subtype
 template<typename SubType>
 struct Exp{
-    // returns const reference of the actual type of this expression
-    inline const SubType& self(void) const{
-        return *static_cast<const SubType*>(this);
-    }
+  // returns const reference of the actual type of this expression
+  inline const SubType& self(void) const {
+    return *static_cast<const SubType*>(this);
+  }
 };
 // binary operators
 struct mul{
-    inline static float Map(float a, float b){
-        return a * b;
-    }
+  inline static float Map(float a, float b) {
+    return a * b;
+  }
 };
 // binary add expression
 // note how it inherits from Exp
 // and put its own type into the template argument
-template<typename OP, typename TLhs, typename TRhs>
-struct BinaryMapExp: public Exp< BinaryMapExp<OP, TLhs, TRhs> >{
-    const TLhs& lhs;
-    const TRhs& rhs;
-    BinaryMapExp(const TLhs& lhs, const TRhs& rhs):lhs(lhs),rhs(rhs){}
-    // evaluation function, evaluate this expression at position i
-    inline float Eval( int i ) const{
-        return OP::Map( lhs.Eval(i), rhs.Eval(i) );
-    }
+template<typename OP, typename TLhs, typename TRhs>
+struct BinaryMapExp: public Exp<BinaryMapExp<OP, TLhs, TRhs> >{
+  const TLhs& lhs;
+  const TRhs& rhs;
+  BinaryMapExp(const TLhs& lhs, const TRhs& rhs)
+      :lhs(lhs), rhs(rhs) {}
+  // evaluation function, evaluate this expression at position i
+  inline float Eval(int i) const {
+    return OP::Map(lhs.Eval(i), rhs.Eval(i));
+  }
 };
-// no constructor and destructor to allocate and de-allocate memory, allocation done by user
+// no constructor and destructor to allocate and de-allocate memory
+// allocation done by user
 struct Vec: public Exp<Vec>{
-    int len;
-    float* dptr;
-    Vec (void){}
-    Vec (float *dptr, int len):len(len),dptr(dptr){}
-    // here is where evaluation happens
-    template<typename EType>
-    inline Vec& operator= (const Exp<EType>& src_){
-        const EType &src = src_.self();
-        for( int i=0; i < len; ++i ){
-            dptr[i] = src.Eval(i);
-        }
-        return *this;
-    }
-    // evaluation function, evaluate this expression at position i
-    inline float Eval( int i ) const{
-        return dptr[i];
+  int len;
+  float* dptr;
+  Vec(void) {}
+  Vec(float *dptr, int len)
+      : len(len), dptr(dptr) {}
+  // here is where evaluation happens
+  template<typename EType>
+  inline Vec& operator=(const Exp<EType>& src_) {
+    const EType &src = src_.self();
+    for (int i = 0; i < len; ++i) {
+      dptr[i] = src.Eval(i);
    }
+    return *this;
+  }
+  // evaluation function, evaluate this expression at position i
+  inline float Eval(int i) const {
+    return dptr[i];
+  }
 };
 // template add, works for any expressions
-template<typename OP, typename TLhs, typename TRhs>
-inline BinaryMapExp<OP, TLhs, TRhs> F(const Exp<TLhs>& lhs, const Exp<TRhs>& rhs){
-    return BinaryMapExp<OP, TLhs, TRhs>(lhs.self(), rhs.self());
+template<typename OP, typename TLhs, typename TRhs>
+inline BinaryMapExp<OP, TLhs, TRhs>
+F(const Exp<TLhs>& lhs, const Exp<TRhs>& rhs) {
+  return BinaryMapExp<OP, TLhs, TRhs>(lhs.self(), rhs.self());
 }
 template<typename TLhs, typename TRhs>
-inline BinaryMapExp<mul, TLhs, TRhs> operator* (const Exp<TLhs>& lhs, const Exp<TRhs>& rhs){
-    return F<mul>(lhs, rhs);
+inline BinaryMapExp<mul, TLhs, TRhs>
+operator*(const Exp<TLhs>& lhs, const Exp<TRhs>& rhs) {
+  return F<mul>(lhs, rhs);
 }
 // user defined operation
 struct maximum{
-    inline static float Map(float a, float b){
-        return a > b ? a : b;
-    }
+  inline static float Map(float a, float b) {
+    return a > b ? a : b;
+  }
 };
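+// editorial sketch: any functor with a static Map can plug into the same
+// F<OP> mechanism; `minimum` here is an illustrative addition, not part of
+// the original example
+struct minimum {
+  inline static float Map(float a, float b) {
+    return a < b ? a : b;
+  }
+};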
-const int n = 3;
-int main( void ){
-    float sa[n]={1,2,3},sb[n]={2,3,4},sc[n]={3,4,5};
-    Vec A(sa,n), B(sb,n), C(sc,n);
-    // run expression, this expression is longer:)
-    A = B * F<maximum>(C, B);
-    for( int i = 0; i < n; ++ i ){
-        printf("%d:%f==%f*max(%f,%f)\n", i, A.dptr[i], B.dptr[i], C.dptr[i], B.dptr[i] );
-    }
-    return 0;
+const int n = 3;
+int main(void) {
+  float sa[n] = {1, 2, 3};
+  float sb[n] = {2, 3, 4};
+  float sc[n] = {3, 4, 5};
+  Vec A(sa, n), B(sb, n), C(sc, n);
+  // run expression, this expression is longer:)
+  A = B * F<maximum>(C, B);
+  for (int i = 0; i < n; ++i) {
+    printf("%d:%f == %f * max(%f, %f)\n",
+           i, A.dptr[i], B.dptr[i], C.dptr[i], B.dptr[i]);
+  }
+  return 0;
 }

From ea08286d746c3f98997cf3c155513b32bdf0f634 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sun, 25 Jan 2015 18:30:20 -0800
Subject: [PATCH 133/147] slight chg

---
 guide/exp-template/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/guide/exp-template/README.md b/guide/exp-template/README.md
index f5b2668ab2fc..c824d8e4e3c6 100644
--- a/guide/exp-template/README.md
+++ b/guide/exp-template/README.md
@@ -3,7 +3,7 @@ Expression Template Tutorial
 This page explains how mshadow works. The main trick behind mshadow is called [Expression Template](http://en.wikipedia.org/wiki/Expression_templates). We will explain how it will affect the performance of compiled code. Expression template is the major trick behind C++ matrix libraries such as Eigen, GSL, boost.uBLAS.

-How to write efficient update rule for machine learning
+How to write efficient machine learning code
 ====
 Before we start, let us think of the question above. Assume we want to write down the update rule
 ```c++

From 3ab03e2795cb28655d679894a7084fd3c446353e Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sun, 25 Jan 2015 20:15:37 -0800
Subject: [PATCH 134/147] add more doc

---
 README.md         |   1 +
 doc/README.md     | 320 ++++++++++++++++++++++++++++++++++++++++++++++
 mshadow/README.md |   8 ++
 3 files changed, 329 insertions(+)
 create mode 100644 doc/README.md
 create mode 100644 mshadow/README.md

diff --git a/README.md b/README.md
index 88d54240b635..3310efc12cdb 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ MShadow is a lightweight CPU/GPU Matrix/Tensor Template Library in C++/CUDA. Th

 * [Contributors](https://github.com/tqchen/mshadow/graphs/contributors)
 * [Tutorial](guide)
+* [Documentation](doc)

 Features
 =====
diff --git a/doc/README.md b/doc/README.md
new file mode 100644
index 000000000000..3502b3e5d351
--- /dev/null
+++ b/doc/README.md
@@ -0,0 +1,320 @@
+MShadow Documentation
+=====
+This is the documentation for mshadow: A Lightweight CPU/GPU Matrix/Tensor Template Library in C++/CUDA.
+
+### Links to Topics
+
+* [Tutorial](../guide)
+* API Documentation
+  - You can run ```./mkdoc.sh``` to generate the documentation locally
+* [Tutorial about Expression Template](../guide/exp-template)
+* [Compile Configuration script](../make)
+* [Expression API](#expression-api)
+  - The Expression API section introduces the concept of expression in mshadow
+
+Expression API
+=====
+Expression is the key concept in mshadow; a common operation of mshadow is ```tensor = some code to construct expression```.
+
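+For example, assuming preallocated tensors ```weight``` and ```grad``` of the same shape, a typical statement reads:
+```c++
+weight = -0.1f * (grad + 0.004f * weight);  // one element-wise pass, no temporary allocated
+```
+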
+There are three major types of expression:
+* Mapper expression: only contains element-wise operations of Mapper expressions
+  - A Mapper expression can be used as a composition component of other operations.
+  - Tensor and scalar are Mapper expressions
+  - Example: ``` weight = - eta * (grad + lambda * weight)``` is a Mapper expression.
+  - Mapper expressions are translated using expression template code implemented by mshadow.
+  - ***Assign safety***: Element-wise mapping is assign safe, which means we can write ```A = A * 2 + B```, making the lvalue appear in the expression; the results are still correct.
+* Chainer expression: may contain operations that are not element-wise, such as reduction and broadcast
+  - Example: ```dst = mirror(src)``` is a Chainer expression
+  - ***Assign safety***: Most of the Chainer extensions are not assign safe, which means the user should avoid putting the target in the source expression.
+* Complex expression: complex operations, which need special translation rules to translate to specific implementations.
+  - A Complex expression can not be used as a composition component of other operations.
+  - Example: ``` dot(lhs.T(), rhs)``` is a Complex expression; we can not write
+``` dst = 1.0 + dot(lhs.T(), rhs)```
+  - But limited syntax is supported depending on the specification; for example, we do support ``` dst += 2.0f * dot(lhs.T(), rhs)```
+  - Complex expressions are translated into specific implementations such as BLAS.
+
+### Element-wise Operations
+The basic binary operators are overloaded to composite Mapper expressions, so we can write
+```c++
+weight = (-eta) * (grad + lambda * weight);
+```
+We can also use customized binary operators and unary operators:
+```c++
+struct maximum {
+  MSHADOW_XINLINE static float Map(float a, float b) {
+    return a > b ? a : b;
+  }
+};
+template<typename xpu>
+void ExampleMaximum(Tensor<xpu, 2> out,
+                    const Tensor<xpu, 2> &A,
+                    const Tensor<xpu, 2> &B) {
+  out = 10.0f * F<maximum>(A + 1.0f, B);
+}
+struct sigmoid {
+  MSHADOW_XINLINE static float Map(float a) {
+    return 1.0f / (1.0f + expf(-a));
+  }
+};
+template<typename xpu>
+void ExampleSigmoid(Tensor<xpu, 2> out, const Tensor<xpu, 2> &in) {
+  // equivalent to out = sigmoid(in*2) + 1;
+  out = F<op::plus>(F<sigmoid>(in * 2.0f), ScalarExp<float>(1.0f));
+}
+```
+### Matrix Multiplications
+Matrix multiplications are supported by the following syntax, where things in brackets [] are optional:
+```
+dst [sv] [scale*] dot(lhs [.T()], rhs [.T()]), sv can be =, +=, -=
+```
+Example:
+```c++
+template<typename xpu>
+void Backprop(Tensor<xpu, 2> gradin,
+              const Tensor<xpu, 2> &gradout,
+              const Tensor<xpu, 2> &netweight) {
+  gradin = 2.0 * dot(gradout, netweight.T());
+}
+```
+
+### Introducing Expression Extensions
+Naming conventions:
+* ```Tensor<xpu, dim>``` refers to any Tensor with any device and dimension.
+* ```xpu```, ```dim``` are implicit template parameters.
+* ```Expr<xpu, dim>``` will be used to refer to any mapper expression with type ```Tensor<xpu, dim>```.
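+
+For example, following these conventions, a Chainer extension composes directly with Mapper code; assuming ```dst``` and ```src``` are ```Tensor<xpu, 2>``` of the same shape and ```bias``` is a ```Tensor<xpu, 1>``` of length ```dst.size(1)```:
+```c++
+dst = repmat(bias, dst.size(0)) + 2.0f * src;
+```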
+
+List of functions:
+* [reshape](#reshape): reshapes a tensor to another shape; the total number of elements must be the same
+* [broadcast](#broadcast): replicate a 1 dimensional tensor in a certain dimension
+* [repmat](#repmat), special case of broadcast<0>: repeat vector over rows to form a matrix
+* [sumall_except_dim](#sumall_except_dim): sum over all the dimensions, except the dimension specified in the template parameter
+* [sum_rows](#sum_rows): special case of sumall_except_dim<0>, sum of rows in the matrix
+* [unpack_patch2col](#unpack_patch2col): unpack local (overlapping) patches of an image to columns of a matrix, can be used to implement convolution
+* [pack_col2patch](#pack_col2patch): reverse operation of unpack_patch2col, can be used to implement deconvolution
+* [pool](#pool): do pooling on an image
+* [unpool](#unpool): get gradient of pooling result
+* [crop](#crop): crop the original image to a smaller size
+* [mirror](#mirror): get the mirrored result of the input expression
+
+======
+##### reshape
+* ```reshape(Expr<xpu, dim> src, Shape<dim2> oshape)```
+* reshapes a tensor to another shape; the total number of elements must be the same
+* parameters:
+  - src: input data
+  - oshape: target shape
+* result expression type: ```Tensor<xpu, dim2>``` with ```shape = oshape```, is a Mapper expression
+```c++
+void ExampleReshape(void) {
+  Tensor<cpu, 2> dst = NewTensor<cpu>(Shape2(4, 5), 0.0f);
+  Tensor<cpu, 1> src = NewTensor<cpu>(Shape1(20), 1.0f);
+  dst = reshape(src, dst.shape_);
+  ...
+}
+```
+======
+
+##### broadcast
+* ```broadcast<dimcast>(Tensor<xpu, 1> src, Shape<dim> oshape)```
+* replicate a 1 dimensional tensor in a certain dimension, specified by the template parameter dimcast
+* parameters:
+  - src: input 1 dimensional tensor
+  - oshape: shape of output
+* return expression type: ```Tensor<xpu, dim>```, ```shape = oshape```, is a Chainer expression
+```c++
+void ExampleBroadcast(void) {
+  Tensor<cpu, 2> dst = NewTensor<cpu>(Shape2(2, 3), 0.0f);
+  Tensor<cpu, 1> src = NewTensor<cpu>(Shape1(2), 1.0f);
+  src[0] = 2.0f; src[1] = 1.0f;
+  dst = broadcast<0>(src, dst.shape_);
+  // dst[0][0] = 2, dst[0][1] = 2; dst[1][0] = 1, dst[1][1] = 1
+  ...
+}
+```
+======
+##### repmat
+* ```repmat(Tensor<xpu, 1> src, int nrows)```
+* special case of broadcast, repeat a 1d tensor over rows
+* input parameters:
+  - src: input vector
+  - nrows: number of rows in the target
+* return expression type: ```Tensor<xpu, 2>```, with ```shape = (nrows, src.size(0))```, is a Chainer expression
+```c++
+void ExampleRepmat(void) {
+  Tensor<cpu, 2> dst = NewTensor<cpu>(Shape2(3, 2), 0.0f);
+  Tensor<cpu, 1> src = NewTensor<cpu>(Shape1(2), 1.0f);
+  src[0] = 2.0f; src[1] = 1.0f;
+  dst = repmat(src, 3);
+  // dst[0][0] = 2, dst[0][1] = 1; dst[1][0] = 2, dst[1][1] = 1
+  ...
+}
+```
+======
+##### sumall_except_dim
+* ```sumall_except_dim<dimkeep>(Expr<xpu, dim> src)```
+* sum over all dimensions, except dimkeep
+* input parameters:
+  - src: input mapper expression
+* return expression type: ```Tensor<xpu, 1>```, with ```shape = (src.size(dimkeep))```, is a Complex expression
+* Syntax: ```dst [sv] [scale*] sumall_except_dim<dimkeep>(src)```, sv can be =, +=, -=, *=, /=
+```c++
+void ExampleSumAllExceptDim(void) {
+  Tensor<cpu, 3> src = NewTensor<cpu>(Shape3(2, 3, 2), 1.0f);
+  Tensor<cpu, 1> dst = NewTensor<cpu>(Shape1(3), 1.0f);
+  dst += sumall_except_dim<1>(src * 2.0f);
+  // dst[0] = 1.0 + 4.0 * 2.0 = 9.0
+  ...
+}
+```
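+
+Note how the Complex-expression syntax restriction plays out here; the commented line is an illustrative sketch, not part of the list above:
+```c++
+dst += 2.0f * sumall_except_dim<1>(src);    // allowed: matches the dst [sv] [scale*] form
+// dst = 1.0f + sumall_except_dim<1>(src);  // not supported: Complex expressions do not compose
+```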
+======
+##### sum_rows
+* ```sum_rows(Expr<xpu, 2> src)```
+* sum of the rows in the matrix
+* input parameters:
+  - src: input mapper expression
+* return expression type: ```Tensor<xpu, 1>```, with ```shape = (src.size(0))```, is a Complex expression
+* Syntax: ```dst [sv] [scale*] sum_rows(src)```, sv can be =, +=, -=, *=, /=
+```c++
+void ExampleSumRows(void) {
+  Tensor<cpu, 2> src = NewTensor<cpu>(Shape2(3, 2), 1.0f);
+  Tensor<cpu, 1> dst = NewTensor<cpu>(Shape1(2), 1.0f);
+  dst += sum_rows(src + 1.0f);
+  // dst[0] = 1.0 + 3.0 * (1.0 + 1.0) = 7.0
+  ...
+}
+```
+======
+##### unpack_patch2col
+* ```unpack_patch2col(Expr<xpu, 3> img, int psize_y, int psize_x, int pstride)```
+* unpack local (overlapping) patches of an image to columns of a matrix; can be used to implement convolution. After getting the unpacked mat, we can use ```output = dot(weight, mat)``` to get the convolved results, with the relations:
+  - weight; shape[0]: out_channel, shape[1]: ichannel * psize_y * psize_x
+  - output; shape[0]: out_channel, shape[1]: out_height * out_width * num_of_images
+  - out_height = (in_height - psize_y) / pstride + 1, this means we pad the imperfect patch with 0
+  - out_width = (in_width - psize_x) / pstride + 1
+* input parameters:
+  - img: source image, can be an expression; (in_channels, in_height, in_width)
+  - psize_y: height of each patch
+  - psize_x: width of each patch
+  - pstride: stride of each patch
+* return expression type: ```Tensor<xpu, 2>```, with ```shape = (in_channel * psize_y * psize_x, out_height * out_width)```, is a Chainer expression
+```c++
+template<typename xpu>
+void ExampleConvolution(Tensor<xpu, 3> dst, Tensor<xpu, 3> src,
+                        Tensor<xpu, 2> weight, int ksize, int stride) {
+  int o_height = (src.size(1) - ksize) / stride + 1;
+  int o_width = (src.size(2) - ksize) / stride + 1;
+  utils::Assert(weight.size(1) == src.size(0) * ksize * ksize);
+  TensorContainer<xpu, 2> tmp_col(Shape2(src.size(0) * ksize * ksize,
+                                         o_height * o_width));
+  TensorContainer<xpu, 2> tmp_dst(Shape2(weight.size(0),
+                                         o_height * o_width));
+  tmp_col = unpack_patch2col(src, ksize, ksize, stride);
+  tmp_dst = dot(weight, tmp_col);
+  dst = reshape(tmp_dst, dst.shape_);
+}
+```
+
+======
+##### pack_col2patch
+* ```pack_col2patch(Tensor<xpu, 2> mat, Shape<3> imshape, int psize_y, int psize_x, int pstride)```
+* reverse operation of unpack_patch2col, can be used to implement deconvolution
+* input parameters:
+  - mat: source mat, same shape as the output of unpack_patch2col
+  - imshape: shape of the target image
+  - psize_y: height of each patch
+  - psize_x: width of each patch
+  - pstride: stride of each patch
+* return expression type: ```Tensor<xpu, 3>```, with ```shape = imshape```, is a Chainer expression
+```c++
+template<typename xpu>
+void ExampleDeconvolution(Tensor<xpu, 3> bottom, Tensor<xpu, 3> top,
+                          Tensor<xpu, 2> weight, int ksize, int stride) {
+  int o_height = (bottom.size(1) - ksize) / stride + 1;
+  int o_width = (bottom.size(2) - ksize) / stride + 1;
+  utils::Assert(weight.size(1) == bottom.size(0) * ksize * ksize);
+  TensorContainer<xpu, 2> tmp_col(Shape2(bottom.size(0) * ksize * ksize,
+                                         o_height * o_width));
+  TensorContainer<xpu, 2> tmp_dst(Shape2(weight.size(0), o_height * o_width));
+  tmp_dst = reshape(top, tmp_dst.shape_);
+  tmp_col = dot(weight.T(), tmp_dst);
+  bottom = pack_col2patch(tmp_col, bottom.shape_, ksize, ksize, stride);
+}
+```
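+
+As a worked shape check for the two extensions above: for a 3-channel 28x28 input with psize_y = psize_x = 5 and pstride = 1, out_height = out_width = (28 - 5) / 1 + 1 = 24, so unpack_patch2col yields a (3 * 5 * 5, 24 * 24) = (75, 576) matrix; dot with an (out_channel, 75) weight gives (out_channel, 576), and pack_col2patch maps the column matrix back to (3, 28, 28).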
+
+======
+##### pool
+* ```pool<Reducer>(Expr<xpu, dim> img, [Shape<2> pshape,] int ksize_y, int ksize_x, int kstride)```
+* pooling on the image with the specified kernel size and stride, can be used to implement max pooling and other pooling layers
+* input parameters:
+  - Reducer: the operation, can be max or sum
+  - img: source image, can be an expression; (in_channels, in_height, in_width)
+  - [optional] Shape<2> pshape: output shape
+  - ksize_y: height of each patch
+  - ksize_x: width of each patch
+  - kstride: stride of each patch
+* return expression: ```Expr<xpu, dim>```, with ```shape = (in_channel, (in_height - ksize_y) / kstride + 1, (in_width - ksize_x) / kstride + 1)```, or the shape given in pshape
+  - is a Chainer expression
+```c++
+template<typename xpu>
+void ExampleMaxPooling(TensorContainer<xpu, 3> &data, int ksize, int kstride) {
+  TensorContainer<xpu, 3> pooled(Shape3(data.size(0),
+                                        (data.size(1) - ksize) / kstride + 1,
+                                        (data.size(2) - ksize) / kstride + 1));
+  pooled = pool<red::maximum>(data, ksize, ksize, kstride);
+}
+```
+
+======
+##### unpool
+* ```unpool<Reducer>(Tensor<xpu, 4> data_src, Tensor<xpu, 4> data_pooled, Tensor<xpu, 4> grad_pooled, int ksize_y, int ksize_x, int kstride)```
+* unpooling on the image with the specified kernel size and stride, can be used to implement the backprop of max pooling and other pooling layers
+* input parameters:
+  - Reducer: the operation, can be max or sum
+  - data_src: source image batch.
+  - data_pooled: pooled image batch.
+  - grad_pooled: gradient of the upper layer
+  - ksize_y: height of each patch
+  - ksize_x: width of each patch
+  - kstride: stride of each patch
+* return:
+  expression with the same shape as data_src
+```c++
+template<typename xpu>
+void ExampleMaxUnpooling(Tensor<xpu, 4> &data_src, Tensor<xpu, 4> &data_pooled,
+                         Tensor<xpu, 4> &grad_pooled, int ksize, int kstride) {
+  TensorContainer<xpu, 4> grad(data_src.shape_);
+  grad = unpool<red::maximum>(data_src, data_pooled,
+                              grad_pooled, ksize, ksize, kstride);
+}
+```
+
+======
+##### crop
+* ```crop(Expr<xpu, dim> src, Shape<2> oshape, int start_height, int start_width)```
+* input parameters:
+  - src: input expression
+  - oshape: output shape after crop
+  - start_height: start height for cropping
+  - start_width: start width for cropping
+* Can also be ```crop(Expr<xpu, dim> src, Shape<2> oshape)```, where the crop will happen at the center.
+* return
+  - cropped expression
+```c++
+template<typename xpu>
+void ExampleCrop(TensorContainer<xpu, 3> img, int start_height, int start_width) {
+  TensorContainer<xpu, 3> cropped(Shape3(img.size(0),
+                                         img.size(1) - start_height,
+                                         img.size(2) - start_width));
+  cropped = crop(img, Shape2(cropped.size(1), cropped.size(2)),
+                 start_height, start_width);
+}
+```
+
+======
+##### mirror
+* ```mirror(Expr<xpu, dim> src)```
+* input:
+  - src: source expression to be mirrored
+* output:
+  - expression of the mirrored result
+```c++
+template<typename xpu>
+void ExampleMirror(TensorContainer<xpu, 3> img) {
+  TensorContainer<xpu, 3> mirrored(img.shape_);
+  mirrored = mirror(img);
+}
+```
+
diff --git a/mshadow/README.md b/mshadow/README.md
new file mode 100644
index 000000000000..86276af013e2
--- /dev/null
+++ b/mshadow/README.md
@@ -0,0 +1,8 @@
+Code Guide
+====
+This readme contains notes about the code in mshadow. MShadow generally follows Google's C++ Style.
+
+Convention
+====
+* Basically, all the files ending in ```-inl.h, -inl.cuh``` are implementations, and can be ignored if you are only using mshadow
+* The files ending in ```.h``` are heavily commented with [doxygen format](http://www.doxygen.org/), and can be used to generate the corresponding documentation.

From 5efdcd67ba1f8d4c08f5f9aec3f3e21d80ec756f Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Thu, 29 Jan 2015 17:48:40 -0800
Subject: [PATCH 135/147] chg

---
 mshadow/extension/channel_unpool.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mshadow/extension/channel_unpool.h b/mshadow/extension/channel_unpool.h
index 8646e8efbe33..6257391d2fd0 100644
--- a/mshadow/extension/channel_unpool.h
+++ b/mshadow/extension/channel_unpool.h
@@ -61,8 +61,8 @@ struct ChannelUnpoolingExp:
 };
 /*!
  * \brief channel unpooling, do unroll over (local nearby) channels
- * \param src source data
- * \param nsize neighbor size
+ * \param src source data
+ * \param nsize neighbor size
  * \param stride stride of the pooling
  * \param pad number of padding at each side
  * \return expression of pooled result
@@ -102,7 +102,7 @@ struct Plan<ChannelUnpoolingExp<Reducer, SrcExp, DType, srcdim>, DType> {
     : data_src_(e.data_src_), data_pooled_(e.data_pooled_),
       grad_pooled_(e.grad_pooled_), channel_(e.shape_[srcdim - 3]),
       height_(e.shape_[srcdim - 2]), pchannel_(e.pchannel_),
-      hnsize_(e.nsize_), stride_(e.stride_), pad_(e.pad_){}
+      hnsize_(e.nsize_), stride_(e.kstride_), pad_(e.pad_){}
   MSHADOW_XINLINE DType Eval(index_t i, index_t j) const {
     using namespace std;
     const DType vsrc = data_src_.Eval(i, j);
@@ -117,8 +117,8 @@ struct Plan<ChannelUnpoolingExp<Reducer, SrcExp, DType, srcdim>, DType> {
     DType val = static_cast<DType>(0);
     for (index_t cc = cstart; cc < cend; ++cc) {
       val += Reducer::PartialGrad(vsrc,
-             data_pooled_.Eval((n * pchannel_ + cc) * height_ + y, x) *
-             grad_pooled_.Eval((n * pchannel_ + cc) * height_ + y, x) );
+             data_pooled_.Eval((n * pchannel_ + cc) * height_ + y, x)) *
+             grad_pooled_.Eval((n * pchannel_ + cc) * height_ + y, x);
     }
     return val;
   }

From af75a475a0cd0590654a679d4cdc69eedf17e112 Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Sat, 7 Feb 2015 12:58:54 -0700
Subject: [PATCH 136/147] fix typo

---
 mshadow/tensor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mshadow/tensor.h b/mshadow/tensor.h
index 7871697f5c36..eb8c5a04ea8a 100644
--- a/mshadow/tensor.h
+++ b/mshadow/tensor.h
@@ -411,7 +411,7 @@ struct Tensor:
   /*!\brief implement the assignment of same type */
   inline Tensor<Device, dimension, DType> &
   operator=(const Tensor<Device, dimension, DType> &exp) {
-    dptr_ = exp.dptr;
+    dptr_ = exp.dptr_;
     shape_ = exp.shape_;
     stride_ = exp.stride_;
     stream_ = exp.stream_;

From 4bacb7be7fa8cc8ff0ce340d76bb12eca515508c Mon Sep 17 00:00:00 2001
From: Johan Pauwels
Date: Mon, 9 Feb 2015 01:08:29 +0100
Subject: [PATCH 137/147] Added config option to use system supplied BLAS
 library on OS X

---
 guide/config.mk | 4 ++--
 make/mshadow.mk | 5 ++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/guide/config.mk b/guide/config.mk
index bcd30bc69a3c..0297304df6de 100644
--- a/guide/config.mk
+++ b/guide/config.mk
@@ -21,10 +21,10 @@ USE_CUDA_PATH = NONE
 #
 # choose the version of blas you want to use
-# can be: mkl, blas, atlas, openblas
+# can be: mkl, blas, atlas, openblas, apple
 USE_BLAS = blas
 #
-# add path to intel libary, you may need it
+# add path to intel library, you may need it
 # for MKL, if you did not add the path to environment variable
 #
 USE_INTEL_PATH = NONE
diff --git a/make/mshadow.mk b/make/mshadow.mk
index f4419f6c9f70..6e7b68b7b989 100644
--- a/make/mshadow.mk
+++ b/make/mshadow.mk
@@ -9,7 +9,7 @@
 #----------------------------------------------------------------------------------------

 MSHADOW_CFLAGS = -msse3 -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas
-MSHADOW_LDFLAGS = -lm
+MSHADOW_LDFLAGS = -lm
 MSHADOW_NVCCFLAGS =

 ifeq ($(USE_CUDA), 0)
@@ -38,6 +38,9 @@ else ifeq ($(USE_BLAS), atlas)
   MSHADOW_LDFLAGS += -lcblas
 else ifeq ($(USE_BLAS), blas)
   MSHADOW_LDFLAGS += -lblas
+else ifeq ($(USE_BLAS), apple)
+  MSHADOW_CFLAGS += -I/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers/
+  MSHADOW_LDFLAGS += -framework Accelerate
 endif

 ifeq ($(PS_PATH), NONE)

From 1a4f981be9e8df0463233e67c99778f8845b4b80 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Thu, 12 Feb 2015 10:13:03 -0800
Subject: [PATCH 138/147] fix

---
 guide/basic.cpp  | 7 ++++---
 mshadow/tensor.h | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/guide/basic.cpp b/guide/basic.cpp
index 260c6054d44b..a78d11dcb866 100644
--- a/guide/basic.cpp
+++ b/guide/basic.cpp
@@ -17,10 +17,11 @@ int main(void) {
   // Tensor object is only a handle, assignment means they have same data content
   // we can specify content type of a Tensor, if not specified, it is float by default
   Tensor<cpu, 2> mat2 = mat;
-
+  mat = Tensor<cpu, 1>(data, Shape1(10)).FlatTo2D();
+
   // shape of matrix, note size order is same as numpy
-  printf("%u X %u matrix\n", mat.size(1), mat.size(1));
-
+  printf("%u X %u matrix\n", mat.size(0), mat.size(1));
+  return 0;
   // initialize all element to zero
   mat = 0.0f;
   // assign some values
diff --git a/mshadow/tensor.h b/mshadow/tensor.h
index 7871697f5c36..eb8c5a04ea8a 100644
--- a/mshadow/tensor.h
+++ b/mshadow/tensor.h
@@ -411,7 +411,7 @@ struct Tensor:
   /*!\brief implement the assignment of same type */
   inline Tensor<Device, dimension, DType> &
   operator=(const Tensor<Device, dimension, DType> &exp) {
-    dptr_ = exp.dptr;
+    dptr_ = exp.dptr_;
     shape_ = exp.shape_;
     stride_ = exp.stride_;
     stream_ = exp.stream_;

From 7635a97f3c7034b3c4e5469f047361aa45041eb5 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Thu, 12 Feb 2015 10:30:21 -0800
Subject: [PATCH 139/147] change of model updater

---
 mshadow-ps/kv_array.h     |  4 ++--
 mshadow-ps/ps.h           | 38 ++++++++++++++++++++++++++++++------
 mshadow-ps/ps_local-inl.h |  4 ++--
 3 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/mshadow-ps/kv_array.h b/mshadow-ps/kv_array.h
index 2bf10a47b226..8f9c96e2807c 100644
--- a/mshadow-ps/kv_array.h
+++ b/mshadow-ps/kv_array.h
@@ -53,7 +53,7 @@ void KVArray<V>::setValue(const MessagePtr& msg) {
   if (my_val.empty()) {
     // initialize weight
     my_val.resize(kr.size(), 0);
-    CHECK_NOTNULL(updater_)->InitKey(key, my_val.data(), my_val.size());
+    CHECK_NOTNULL(updater_)->InitModel(key, my_val.data(), my_val.size());
   }

   // update weight
@@ -70,7 +70,7 @@ void KVArray<V>::getValue(const MessagePtr& msg) {
   if (my_val.empty()) {
     // initialize weight
     my_val.resize(kr.size(), 0);
-    CHECK_NOTNULL(updater_)->InitKey(msg->task.key_channel(), my_val.data(), my_val.size());
+    CHECK_NOTNULL(updater_)->InitModel(msg->task.key_channel(), my_val.data(), my_val.size());
   }

   // TODO store the kr in memory
diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h
index 5fa99109a505..6ee4762827d8 100644
--- a/mshadow-ps/ps.h
+++ b/mshadow-ps/ps.h
@@ -207,26 +207,52 @@ class IModelUpdater {
    * \param name name of parameter
    * \param val value of parameter
    */
-  virtual void SetParam(const char *name, const char *val) = 0;
+  virtual void SetParam(const char *name, const char *val) {}
   /*!
-   * \brief init the server
+   * \brief init the model updater
    * \param rank the rank of the node
   * \param conf configuration
   */
-  virtual void Init(int rank, const std::string &conf) = 0;
+  virtual void InitUpdater(int rank, const std::string &conf) {}
+  /*!
+   * \brief initialize the model
+   * \param key the key of data we point to
+   * \param dptr the data pointer
+   * \param size size of the parameter key
+   */
+  virtual void InitModel(int key, DType *dptr, size_t size) {
+    this->InitModel_(key, Tensor<cpu, 1, DType>(dptr, Shape1(size)));
+  }
  /*!
-   * \brief initialize the key
+   * \brief update the model
   * \param key the key of data we point to
   * \param dptr the data pointer
   * \param size size of the parameter key
   */
-  virtual void InitKey(int key, DType *dptr, size_t size) = 0;
+  virtual void Update(int key, DType *dptr, size_t size) {
+    this->Update_(key, Tensor<cpu, 1, DType>(dptr, Shape1(size)));
+  }
+
+ protected:
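+  // note: the public InitModel/Update above wrap dptr/size into a 1D tensor
+  // and forward to the protected hooks below, so user subclasses can work
+  // with Tensor operations directly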
+  /*!
+   * \brief initialize the model, user can implement this one
+   *        to take advantage of tensor operations
+   * \param key the key of data we point to
+   * \param data the tensor data corresponding to the data we want to initialize
+   */
+  virtual void InitModel_(int key, Tensor<cpu, 1, DType> data) {
+    utils::Error("InitModel: not implemented");
+  }
   /*!
+   * \brief update the model, user can implement this one
+   *        to take advantage of tensor operations
    * \param key the key of data we point to
-   * \param dptr the data pointer
-   * \param size size of the parameter key
+   * \param data the tensor data corresponding to the update
    */
-  virtual void Update(int key, DType *dptr, size_t size) = 0;
+  virtual void Update_(int key, Tensor<cpu, 1, DType> data) {
+    utils::Error("Update: not implemented");
+  }
 };
 /*!
  * \brief create customized server
diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h
index d6c9a2655019..c7303f3c331f 100644
--- a/mshadow-ps/ps_local-inl.h
+++ b/mshadow-ps/ps_local-inl.h
@@ -306,7 +306,7 @@ class LocalModel : public ISharedModel<xpu, DType> {
   virtual void ServerInitKey(Tensor<cpu, 2> weight, int key) {
     if (custom_server != NULL) {
       // initialize server, and ready for pullback
-      custom_server->InitKey(key, weight.dptr_, weight.MSize());
+      custom_server->InitModel(key, weight.dptr_, weight.MSize());
       this->PullReady(weight, key);
     }
   }
@@ -354,7 +354,7 @@ class LocalModel : public ISharedModel<xpu, DType> {
       custom_server->SetParam(cfgvec[j].first.c_str(),
                               cfgvec[j].second.c_str());
     }
-    custom_server->Init(0, std::string());
+    custom_server->InitUpdater(0, std::string());
   }
 }
 protected:

From d30327f4bd17a57734308dfab58dc754dfc3f48a Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Thu, 12 Feb 2015 10:35:09 -0800
Subject: [PATCH 140/147] Update ps_dist-inl.h

---
 mshadow-ps/ps_dist-inl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h
index faea92bd71da..ed955e9da6a1 100644
--- a/mshadow-ps/ps_dist-inl.h
+++ b/mshadow-ps/ps_dist-inl.h
@@ -95,7 +95,7 @@ class MShadowServerNode : public PS::App {
   MShadowServerNode(const std::string &conf) : App() {
     updater_ = CreateModelUpdater<DType>();
-    updater_->Init(myRank(), conf);
+    updater_->InitUpdater(myRank(), conf);
     shared_model_ = new PS::KVArray<DType>();
     shared_model_->setUpdater(updater_);
   }

From 8f46c8033535dd6f0dabae7f9956b3d813678713 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Thu, 12 Feb 2015 20:48:35 -0800
Subject: [PATCH 141/147] Update README.md

---
 guide/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/guide/README.md b/guide/README.md
index 5f539eb66da8..84573585eed9 100644
--- a/guide/README.md
+++ b/guide/README.md
@@ -4,7 +4,7 @@ This is a beginner's tutorial of mshadow. If you like mshadow and have ideas to
 Please send a pull-request if you would like to share your experience.

-See also [Expression Template Tutorial][exp-template]
+See also [Expression Template Tutorial](exp-template)

 **List of Topics**
 * [Tensor Data Structure](#tensor-data-structure)
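
For reference, a minimal custom updater written against the IModelUpdater interface introduced in PATCH 139 could look like the sketch below; the class name, the plain-SGD rule and the fixed learning rate are illustrative assumptions, not part of these patches:

```c++
#include <map>
#include <mshadow/tensor.h>
#include <mshadow-ps/ps.h>

// hypothetical plain-SGD server-side updater
class SGDUpdater : public mshadow::ps::IModelUpdater<float> {
 protected:
  // called once per key: the server hands us its weight buffer as a 1D tensor
  virtual void InitModel_(int key, mshadow::Tensor<mshadow::cpu, 1, float> data) {
    data = 0.0f;           // start from zero-initialized weights
    weights_[key] = data;  // keep the handle, the server owns the memory
  }
  // called per push: data holds the aggregated gradient for this key
  virtual void Update_(int key, mshadow::Tensor<mshadow::cpu, 1, float> data) {
    weights_[key] -= 0.01f * data;  // eta = 0.01, assumed constant
  }
 private:
  std::map<int, mshadow::Tensor<mshadow::cpu, 1, float> > weights_;
};
```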
-See also [Expression Template Tutorial][exp-template] +See also [Expression Template Tutorial](exp-template) **List of Topics** * [Tensor Data Structure](#tensor-data-structure) From 42e79307590d3821e79aa4d2de5e5d78d26da3c6 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 13 Feb 2015 11:12:11 -0800 Subject: [PATCH 142/147] ok --- mshadow-ps/README.md | 20 +++++- mshadow-ps/{ => example}/Makefile | 18 ++--- mshadow-ps/example/config.mk | 35 +++++++++ mshadow-ps/example/local_sum-inl.h | 111 +++++++++++++++++++++++++++++ mshadow-ps/example/local_sum.cpp | 4 ++ mshadow-ps/example/local_sum.cu | 4 ++ mshadow-ps/ps.h | 2 + mshadow-ps/ps_local-inl.h | 6 +- mshadow-ps/test.cpp | 58 --------------- mshadow-ps/test.cu | 47 ------------ 10 files changed, 187 insertions(+), 118 deletions(-) rename mshadow-ps/{ => example}/Makefile (60%) create mode 100644 mshadow-ps/example/config.mk create mode 100644 mshadow-ps/example/local_sum-inl.h create mode 100644 mshadow-ps/example/local_sum.cpp create mode 100644 mshadow-ps/example/local_sum.cu delete mode 100644 mshadow-ps/test.cpp delete mode 100644 mshadow-ps/test.cu diff --git a/mshadow-ps/README.md b/mshadow-ps/README.md index e92ec6b95e58..a9ceb7dca80d 100644 --- a/mshadow-ps/README.md +++ b/mshadow-ps/README.md @@ -1 +1,19 @@ -This folder contains parameter server abstraction for mshadow Tensor. +mshadow-ps +==== +### Parameter Server Interface for GPU Tensor + +mshadow-ps provides asynchronize parameter server interface for mshadow GPU/CPU Tensor. +This allows you to do ***multi-GPU*** and ***disrtibuted*** (deep) learning in +an ***easy*** and ***unified*** way. + +Introduction +==== + +The interface of mshadow-ps is [ps.h](ps.h), + + +#### Getting Sum from Multiple GPUs + + + +#### A MultiGPU Neural Net diff --git a/mshadow-ps/Makefile b/mshadow-ps/example/Makefile similarity index 60% rename from mshadow-ps/Makefile rename to mshadow-ps/example/Makefile index 108ff756ff51..70cb724248f0 100644 --- a/mshadow-ps/Makefile +++ b/mshadow-ps/example/Makefile @@ -2,21 +2,23 @@ export CC = gcc export CXX = g++ export NVCC =nvcc -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -DMSHADOW_STAND_ALONE=1 -std=c++11 -fopenmp -export LDFLAGS= -lm -lpthread -export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) +include config.mk +include ../../make/mshadow.mk +export CFLAGS = -Wall -O3 -fopenmp -I../../ $(MSHADOW_CFLAGS) +export LDFLAGS= -lm $(MSHADOW_LDFLAGS) +export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) # specify tensor path -BIN = test +BIN = local_sum.cpu OBJ = CUOBJ = -CUBIN = cutest +CUBIN = local_sum.gpu .PHONY: clean all -all: $(BIN) $(OBJ) +all: $(BIN) $(CUBIN) -test: test.cpp *.h -cutest: test.cu *.h +local_sum.cpu: local_sum.cpp +local_sum.gpu: local_sum.cu $(BIN) : $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) diff --git a/mshadow-ps/example/config.mk b/mshadow-ps/example/config.mk new file mode 100644 index 000000000000..0297304df6de --- /dev/null +++ b/mshadow-ps/example/config.mk @@ -0,0 +1,35 @@ +#--------------------------------------------------------------------------------------- +# mshadow: the configuration compile script +# +# This is configuration script that you can use to compile mshadow +# Usage: +# +# include config.mk in your Makefile, or directly include the definition of variables +# include mshadow.mk after the variables are set +# +# Add MSHADOW_CFLAGS to the compile flags +# Add MSHADOW_LDFLAGS to the linker flags +# Add MSHADOW_NVCCFLAGS to the nvcc compile 
+#----------------------------------------------------------------------------------------
+
+# whether use CUDA during compile
+USE_CUDA = 0
+
+# add the path to the CUDA library to link and compile flag
+# if you have already added them to the environment variable, leave it as NONE
+USE_CUDA_PATH = NONE
+
+#
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas, apple
+USE_BLAS = blas
+#
+# add path to intel library, you may need it
+# for MKL, if you did not add the path to the environment variable
+#
+USE_INTEL_PATH = NONE
+
+# whether compile with parameter server
+USE_DIST_PS = 0
+PS_PATH = NONE
+PS_THIRD_PATH = NONE
diff --git a/mshadow-ps/example/local_sum-inl.h b/mshadow-ps/example/local_sum-inl.h
new file mode 100644
index 000000000000..4876cf9baafa
--- /dev/null
+++ b/mshadow-ps/example/local_sum-inl.h
@@ -0,0 +1,111 @@
+// This is an example demonstrating the usage of mshadow ps
+#include <mshadow/tensor.h>
+// use openmp to launch multiple threads
+#include <omp.h>
+#include <mshadow-ps/ps.h>
+#include <cstdio>
+
+// simple util to print result
+void Print_(mshadow::Tensor<mshadow::cpu, 2, float> ts) {
+  for (mshadow::index_t i = 0; i < ts.size(0); ++i) {
+    for (mshadow::index_t j = 0; j < ts.size(1); ++j) {
+      printf("%g ", ts[i][j]);
+    }
+    printf("\n");
+  }
+}
+template<typename xpu>
+inline void Print(mshadow::Tensor<xpu, 2, float> ts) {
+  mshadow::TensorContainer<mshadow::cpu, 2, float> tmp;
+  tmp.Resize(ts.shape_);
+  mshadow::Copy(tmp, ts);
+  Print_(tmp);
+}
+
+// this function is run by a specific thread
+template<typename xpu>
+inline void RunWorkerThread(int devid,
+                            mshadow::ps::ISharedModel<xpu, float> *ps) {
+  // initialize tensor engine
+  mshadow::InitTensorEngine<xpu>(devid);
+  mshadow::Stream<xpu> *stream = mshadow::NewStream<xpu>();
+  // allocate tensor on xpu
+  mshadow::TensorContainer<xpu, 2> data(mshadow::Shape2(2, 3));
+  // set the computation stream to the newly allocated stream
+  // this will make subsequent computation whose target is data
+  // use the stream, stream is needed for async execution on GPU
+  data.set_stream(stream);
+  // assume these operations set the content of data
+  data[0] = 1.0f;
+  data[1] = devid + data[0];
+  printf("dev%d: before sync, data:\n", devid);
+  // use Print to show the result, do not call
+  // print normally since Copy will block
+  Print(data);
+  printf("====================\n");
+  // initialize the key, register the shape on the parameter server
+  ps->InitKey(data[0].shape_, 0, devid);
+  ps->InitKey(data[1].shape_, 1, devid);
+  // push data[0] out, for update, or aggregation
+  // 0 is the key of the data, devid is the current device id
+  ps->Push(data[0], 0, devid);
+  // pull request is used to request the data to be copied back
+  // once computation is done
+  ps->PullReq(data[0], 0, devid);
+  // computation can be done here..
+  // the pull request handler will be overlapped with the computation
+  // similar to the previous call
+  ps->Push(data[1], 1, devid);
+  ps->PullReq(data[1], 1, devid);
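+  // (illustration) while the four requests above are in flight, the worker
+  // could keep computing on buffers that are not being synchronized; only
+  // PullWait below forces it to block on a key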
+  // more computation can be done here...
+  // the computation will be overlapped
+  // PullWait will block until these requests finish
+  ps->PullWait(0, devid);
+  ps->PullWait(1, devid);
+  printf("dev%d: after sync, data:\n", devid);
+  // use Print to show the result, do not call
+  // print normally since Copy will block
+  Print(data);
+  printf("====================\n");
+}
+
+namespace mshadow {
+namespace ps {
+// model updater is used when update is happening on server side
+// if we only use parameter server for sum aggregation
+// this is not needed, but we must declare this function to return NULL
+template<>
+IModelUpdater<float> *CreateModelUpdater(void) {
+  return NULL;
+}
+}
+}
+
+template<typename xpu>
+inline int Run(int argc, char *argv[]) {
+  if (argc < 2) {
+    printf("Usage: <list of devices>\n"\
+           "\tfor CPU the device list can be arbitrary\n"\
+           "\tfor GPU the device list should be the GPU device indices\n");
+    return 0;
+  }
+  // list of device ids
+  std::vector<int> devs;
+  // initialization
+  for (int i = 1; i < argc; ++i) {
+    // record the device id
+    devs.push_back(atoi(argv[i]));
+  }
+  mshadow::ps::ISharedModel<xpu, float>
+      *ps = mshadow::ps::CreateSharedModel<xpu, float>("local");
+  // initialize the ps
+  ps->Init(devs);
+  // use openmp to launch #devs threads
+  #pragma omp parallel num_threads(devs.size())
+  {
+    int tid = omp_get_thread_num();
+    RunWorkerThread<xpu>(devs[tid], ps);
+  }
+  delete ps;
+  return 0;
+}
diff --git a/mshadow-ps/example/local_sum.cpp b/mshadow-ps/example/local_sum.cpp
new file mode 100644
index 000000000000..7f0eed0df42e
--- /dev/null
+++ b/mshadow-ps/example/local_sum.cpp
@@ -0,0 +1,4 @@
+#include "./local_sum-inl.h"
+int main(int argc, char *argv[]) {
+  return Run<mshadow::cpu>(argc, argv);
+}
diff --git a/mshadow-ps/example/local_sum.cu b/mshadow-ps/example/local_sum.cu
new file mode 100644
index 000000000000..6e839601a265
--- /dev/null
+++ b/mshadow-ps/example/local_sum.cu
@@ -0,0 +1,4 @@
+#include "./local_sum-inl.h"
+int main(int argc, char *argv[]) {
+  return Run<mshadow::gpu>(argc, argv);
+}
diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h
index 6ee4762827d8..4d9c2469a60a 100644
--- a/mshadow-ps/ps.h
+++ b/mshadow-ps/ps.h
@@ -65,6 +65,8 @@ class ISharedModel {
   }
   /*!
* \brief initialize a key with certain shape + * must be called before using Push/PullReq/PullWait + * on the corresponding key * \param shape the shape content of the key * \param key the unique key to indicate the tensor * this is unique per device diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h index c7303f3c331f..fa092dc68bce 100644 --- a/mshadow-ps/ps_local-inl.h +++ b/mshadow-ps/ps_local-inl.h @@ -227,10 +227,8 @@ class LocalModel : public ISharedModel { }; virtual void InitKey_(Shape<2> shape, int key, int devid) { - if (devid == devices[0]) { - this->InitPullMap(key); - this->InitPushMap(key, shape); - } + this->InitPullMap(key); + this->InitPushMap(key, shape); } virtual void Push_(Tensor data, diff --git a/mshadow-ps/test.cpp b/mshadow-ps/test.cpp deleted file mode 100644 index 51467bbe0918..000000000000 --- a/mshadow-ps/test.cpp +++ /dev/null @@ -1,58 +0,0 @@ -#define MSHADOW_STAND_ALONE 1 -#ifdef _MSC_VER -#define _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_DEPRECATE -#define NOMINMAX -#endif -#define MSHADOW_DIST_PS 0 -#include "./ps.h" -using namespace mshadow; -void Print1DTensor(Tensor const &ts) { - for (index_t i = 0; i < ts.size(0); ++i) { - printf("%.2f ", ts[i]); - } - printf("\n"); -} - -void Print2DTensor(Tensor const &ts) { - for (index_t i = 0; i < ts.size(0); ++i) { - Print1DTensor(ts[i]); - } -} - -int main(int argc, char *argv[]) { - if (argc < 2) { - printf("Usage:\n"); return 0; - } - int ndev = atoi(argv[1]); - ps::ISharedModel *ps = ps::CreateSharedModel("local"); - TensorContainer ts(Shape3(ndev,5,2)); - TensorContainer res(Shape3(ndev,5,2)); - std::vector devs; - for (int i = 0; i < ndev; ++i) { - devs.push_back(i); - //ts[i] = static_cast(1.0 + i); - } - ps->Init(devs); - for (int i = 0; i < ndev; ++i) { - ps->Push(ts[i], 3, i); - int a = i; - ps->PullWait(3, i); - ps->PullReq(res[i], 3, i, 0); - } - for (int i = 0; i < ndev; ++i) { - ps->PullWait(3, i); - ps->PullWait(3, i); - printf("----dev=%d----\n", i); - Print2DTensor(res[i]); - } - return 0; -} -namespace mshadow { -namespace ps { -template<> -mshadow::ps::IModelUpdater *CreateModelUpdater(){ - return NULL; -} -} -} \ No newline at end of file diff --git a/mshadow-ps/test.cu b/mshadow-ps/test.cu deleted file mode 100644 index 883f4a256110..000000000000 --- a/mshadow-ps/test.cu +++ /dev/null @@ -1,47 +0,0 @@ -#include "./ps.h" -using namespace mshadow; -void Print1DTensor(Tensor const &ts) { - for (index_t i = 0; i < ts.size(0); ++i) { - printf("%.2f ", ts[i]); - } - printf("\n"); -} - -void Print2DTensor(Tensor const &ts) { - for (index_t i = 0; i < ts.size(0); ++i) { - Print1DTensor(ts[i]); - } -} - -int main(int argc, char *argv[]) { - if (argc < 2) { - printf("Usage:\n"); return 0; - } - int ndev = atoi(argv[1]); - ps::IParamServer *ps = ps::Create("local"); - TensorContainer ts(Shape3(ndev,5,2)); - TensorContainer res(Shape3(ndev,5,2)); - TensorContainer tscpu(Shape3(ndev,5,2)); - TensorContainer rescpu(Shape3(ndev,5,2)); - std::vector devs; - for (int i = 0; i < ndev; ++i) { - devs.push_back(i); - tscpu[i] = 1.0 + i; - } - mshadow::Copy(ts, tscpu); - ps->Init(devs); - for (int i = 0; i < ndev; ++i) { - ps->Push(ts[i], 3, i); - ps->PullWait(3, i); - ps->PullReq(res[i], 3, i, 0); - } - for (int i = 0; i < ndev; ++i) { - ps->PullWait(3, i); - } - mshadow::Copy(rescpu, res); - for (int i = 0; i < ndev; ++i) { - printf("----dev=%d----\n", i); - Print2DTensor(rescpu[i]); - } - return 0; -} From d2fc7429df5fe8a6b609e4d7795816eba3c6e06b Mon Sep 17 00:00:00 2001 From: tqchen 
Date: Fri, 13 Feb 2015 17:04:14 -0800 Subject: [PATCH 143/147] add multi-GPU ps example --- guide/basic.cpp | 4 +- guide/config.mk | 2 +- guide/defop.cpp | 4 +- guide/neuralnet/Makefile | 21 +- guide/neuralnet/README.md | 5 +- guide/neuralnet/config.mk | 35 ++++ guide/neuralnet/convnet.cu | 19 +- guide/neuralnet/nnet.cu | 13 +- guide/neuralnet/nnet_ps.cu | 308 +++++++++++++++++++++++++++++ guide/neuralnet/run.sh | 1 - mshadow-ps/example/local_sum-inl.h | 3 +- mshadow/tensor.h | 12 +- mshadow/tensor_cpu-inl.h | 7 + mshadow/tensor_gpu-inl.h | 24 +-- 14 files changed, 404 insertions(+), 54 deletions(-) create mode 100644 guide/neuralnet/config.mk create mode 100644 guide/neuralnet/nnet_ps.cu delete mode 100644 guide/neuralnet/run.sh diff --git a/guide/basic.cpp b/guide/basic.cpp index a78d11dcb866..cb6586d398d0 100644 --- a/guide/basic.cpp +++ b/guide/basic.cpp @@ -7,7 +7,7 @@ using namespace mshadow::expr; int main(void) { // intialize tensor engine before using tensor operation, needed for CuBLAS - InitTensorEngine(); + InitTensorEngine(); // assume we have a float space float data[20]; // create a 2 x 5 x 2 tensor, from existing space @@ -37,6 +37,6 @@ int main(void) { printf("\n"); } // shutdown tensor enigne after usage - ShutdownTensorEngine(); + ShutdownTensorEngine(); return 0; } diff --git a/guide/config.mk b/guide/config.mk index 0297304df6de..b28f41741543 100644 --- a/guide/config.mk +++ b/guide/config.mk @@ -22,7 +22,7 @@ USE_CUDA_PATH = NONE # # choose the version of blas you want to use # can be: mkl, blas, atlas, openblas, apple -USE_BLAS = blas +USE_BLAS = atlas # # add path to intel library, you may need it # for MKL, if you did not add the path to enviroment variable diff --git a/guide/defop.cpp b/guide/defop.cpp index 5843f75842de..074b81cc141e 100644 --- a/guide/defop.cpp +++ b/guide/defop.cpp @@ -26,7 +26,7 @@ struct maxoftwo { int main(void) { // intialize tensor engine before using tensor operation, needed for CuBLAS - InitTensorEngine(); + InitTensorEngine(); // take first subscript of the tensor Tensor mat = NewTensor(Shape2(2,3), 0.0f); Tensor mat2= NewTensor(Shape2(2,3), 0.0f); @@ -42,6 +42,6 @@ int main(void) { } FreeSpace(&mat); FreeSpace(&mat2); // shutdown tensor enigne after usage - ShutdownTensorEngine(); + ShutdownTensorEngine(); return 0; } diff --git a/guide/neuralnet/Makefile b/guide/neuralnet/Makefile index 7cb45e4afa2d..826384b5f3b0 100644 --- a/guide/neuralnet/Makefile +++ b/guide/neuralnet/Makefile @@ -2,31 +2,27 @@ export CC = gcc export CXX = g++ export NVCC =nvcc -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../../ - - -ifeq ($(blas),1) - LDFLAGS= -lcblas -lm -lcudart -lcublas -lcurand - CFLAGS+= -DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CBLAS=1 -else - LDFLAGS= -lm -lcudart -lcublas -lcurand -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lpthread -endif -export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) +include config.mk +include ../../make/mshadow.mk +export CFLAGS = -Wall -O3 -I../../ -fopenmp $(MSHADOW_CFLAGS) +export LDFLAGS= -lm $(MSHADOW_LDFLAGS) +export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) # specify tensor path BIN = OBJ = CUOBJ = -CUBIN = nnet convnet +CUBIN = nnet convnet nnet_ps .PHONY: clean all all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ) nnet: nnet.cu +nnet_ps: nnet_ps.cu convnet: convnet.cu $(BIN) : - $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) + $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) $(OBJ) : $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter 
%.cpp %.c, $^) ) @@ -39,3 +35,4 @@ $(CUBIN) : clean: $(RM) $(OBJ) $(BIN) $(CUBIN) $(CUOBJ) *~ + diff --git a/guide/neuralnet/README.md b/guide/neuralnet/README.md index fb5b59a3fb07..519acc436ce3 100644 --- a/guide/neuralnet/README.md +++ b/guide/neuralnet/README.md @@ -1,8 +1,7 @@ This folder contains a mshadow example of simple neural net implementation -To compile the code, type make: -* You will need to have CUDA and MKL installed. -* Alternatively, you can compile with CBLAS packages to replace MKL such as BLAS or ATLAS, type make blas=1 +To compile the code, modify ```config.mk``` to the setting you like and type make +* You will need to have CUDA and a version of BLAS To run the demo, download MNIST dataset from: http://yann.lecun.com/exdb/mnist/ unzip all the files into current folder diff --git a/guide/neuralnet/config.mk b/guide/neuralnet/config.mk new file mode 100644 index 000000000000..834b430c0f8c --- /dev/null +++ b/guide/neuralnet/config.mk @@ -0,0 +1,35 @@ +#--------------------------------------------------------------------------------------- +# mshadow: the configuration compile script +# +# This is configuration script that you can use to compile mshadow +# Usage: +# +# include config.mk in your Makefile, or directly include the definition of variables +# include mshadow.mk after the variables are set +# +# Add MSHADOW_CFLAGS to the compile flags +# Add MSHADOW_LDFLAGS to the linker flags +# Add MSHADOW_NVCCFLAGS to the nvcc compile flags +#---------------------------------------------------------------------------------------- + +# whether use CUDA during compile +USE_CUDA = 1 + +# add the path to CUDA libary to link and compile flag +# if you have already add them to enviroment variable, leave it as NONE +USE_CUDA_PATH = NONE + +# +# choose the version of blas you want to use +# can be: mkl, blas, atlas, openblas, apple +USE_BLAS = atlas +# +# add path to intel library, you may need it +# for MKL, if you did not add the path to enviroment variable +# +USE_INTEL_PATH = NONE + +# whether compile with parameter server +USE_DIST_PS = 0 +PS_PATH = NONE +PS_THIRD_PATH = NONE diff --git a/guide/neuralnet/convnet.cu b/guide/neuralnet/convnet.cu index ce2f644abbaf..97b6a03fc416 100644 --- a/guide/neuralnet/convnet.cu +++ b/guide/neuralnet/convnet.cu @@ -193,9 +193,7 @@ int main(int argc, char *argv[]) { if(argc < 2) { printf("Usage: cpu or gpu\n"); return 0; } - srand(0); - InitTensorEngine(); - + srand(0); // settings int batch_size = 100; int insize = 28; @@ -207,9 +205,11 @@ int main(int argc, char *argv[]) { // choose which version to use INNet *net; - if(!strcmp(argv[1], "gpu")) { + if (!strcmp(argv[1], "gpu")) { + InitTensorEngine(); net = new ConvNet(batch_size, insize, nchannel, ksize, kstride, psize, num_out); - }else{ + } else { + InitTensorEngine(); net = new ConvNet(batch_size, insize, nchannel, ksize, kstride, psize, num_out); } @@ -227,7 +227,7 @@ int main(int argc, char *argv[]) { TensorContainer xtrain(Shape4(xtrain_.size(0), 1, insize, insize)); TensorContainer xtest(Shape4(xtest_.size(0), 1, insize, insize)); xtrain = reshape(xtrain_, xtrain.shape_); - xtest = reshape(xtest_, xtest.shape_); + xtest = reshape(xtest_, xtest.shape_); int num_iter = 20; @@ -257,6 +257,11 @@ int main(int argc, char *argv[]) { printf("round %d: test-err=%f\n", i, (float)nerr/xtest.size(0)); } delete net; - ShutdownTensorEngine(); + + if (!strcmp(argv[1], "gpu")) { + ShutdownTensorEngine(); + } else { + ShutdownTensorEngine(); + } return 0; } diff --git a/guide/neuralnet/nnet.cu 
b/guide/neuralnet/nnet.cu
index b4dea5f1d8ce..8e79cf608f3c 100644
--- a/guide/neuralnet/nnet.cu
+++ b/guide/neuralnet/nnet.cu
@@ -123,7 +123,6 @@ int main(int argc, char *argv[]) {
     printf("Usage: cpu or gpu\n"); return 0;
   }
   srand(0);
-  InitTensorEngine();

   // settings
   int batch_size = 100;
@@ -132,9 +131,11 @@ int main(int argc, char *argv[]) {
   int num_out = 10;
   // choose which version to use
   INNet *net;
-  if(!strcmp(argv[1], "gpu")) {
+  if (!strcmp(argv[1], "gpu")) {
+    InitTensorEngine<gpu>();
     net = new NNet<gpu>(batch_size, num_in, num_hidden, num_out);
-  }else{
+  } else {
+    InitTensorEngine<cpu>();
     net = new NNet<cpu>(batch_size, num_in, num_hidden, num_out);
   }

@@ -178,6 +179,10 @@ int main(int argc, char *argv[]) {
     printf("round %d: test-err=%f\n", i, (float)nerr/xtest.size(0));
   }
   delete net;
-  ShutdownTensorEngine();
+  if (!strcmp(argv[1], "gpu")) {
+    ShutdownTensorEngine<gpu>();
+  } else {
+    ShutdownTensorEngine<cpu>();
+  }
   return 0;
 }
diff --git a/guide/neuralnet/nnet_ps.cu b/guide/neuralnet/nnet_ps.cu
new file mode 100644
index 000000000000..2a5dd828afbf
--- /dev/null
+++ b/guide/neuralnet/nnet_ps.cu
@@ -0,0 +1,308 @@
+// this implements a simple two layer neural net
+#include <vector>
+#include <cmath>
+#include <omp.h>
+// header file to use mshadow
+#include <mshadow/tensor.h>
+#include <mshadow-ps/ps.h>
+// helper function to load mnist dataset
+#include "./util.h"
+// this namespace contains all data structures, functions
+using namespace mshadow;
+// this namespace contains all operator overloads
+using namespace mshadow::expr;
+
+// define sigmoid operation
+struct sigmoid {
+  MSHADOW_XINLINE static real_t Map(real_t a) {
+    return 1.0f / (1.0f + expf(-a));
+  }
+};
+
+/*! \brief interface for nnet, the interface allows us to use GPU/CPU implementations in a unified way */
+class INNet{
+ public:
+  virtual void Forward(const Tensor<cpu, 2, real_t>& inbatch,
+                       Tensor<cpu, 2, real_t> &oubatch) = 0;
+  virtual void Backprop(const Tensor<cpu, 2, real_t>& gradout) = 0;
+  virtual ~INNet() {}
+};
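+// note: the interface passes cpu tensors on purpose; each implementation
+// copies them onto its own device, so callers stay device agnostic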
+/*!
+ * \brief simple two layer neural net
+ *        this implementation is device invariant
+ */
+template<typename xpu>
+class NNet : public INNet {
+ public:
+  // initialize the network
+  NNet(int batch_size, int num_in, int num_hidden, int num_out,
+       int devid, mshadow::ps::ISharedModel<xpu, real_t> *ps)
+      : rnd(0), devid(devid), ps(ps) {
+    mshadow::SetDevice<xpu>(devid);
+    stream = mshadow::NewStream<xpu>();
+    // set the computing streams
+    ninput.set_stream(stream);
+    nhidden.set_stream(stream);
+    nhiddenbak.set_stream(stream);
+    nout.set_stream(stream);
+    hbias.set_stream(stream);
+    obias.set_stream(stream);
+    g_hbias.set_stream(stream);
+    g_obias.set_stream(stream);
+    Wi2h.set_stream(stream);
+    Wh2o.set_stream(stream);
+    g_Wi2h.set_stream(stream);
+    g_Wh2o.set_stream(stream);
+    rnd.set_stream(stream);
+    // setup nodes
+    ninput.Resize(Shape2(batch_size, num_in));
+    nhidden.Resize(Shape2(batch_size, num_hidden));
+    nhiddenbak.Resize(nhidden.shape_);
+    nout.Resize(Shape2(batch_size, num_out));
+    // setup bias
+    hbias.Resize(Shape1(num_hidden)); g_hbias.Resize(hbias.shape_);
+    obias.Resize(Shape1(num_out)); g_obias.Resize(obias.shape_);
+    hbias = 0.0f; obias = 0.0f;
+    // setup weights
+    Wi2h.Resize(Shape2(num_in, num_hidden)); g_Wi2h.Resize(Wi2h.shape_);
+    Wh2o.Resize(Shape2(num_hidden, num_out)); g_Wh2o.Resize(Wh2o.shape_);
+    rnd.SampleGaussian(&Wi2h, 0, 0.01f);
+    rnd.SampleGaussian(&Wh2o, 0, 0.01f);
+    // initialize the key
+    ps->InitKey(Wi2h.shape_, 0, devid);
+    ps->InitKey(hbias.shape_, 1, devid);
+    ps->InitKey(Wh2o.shape_, 2, devid);
+    ps->InitKey(obias.shape_, 3, devid);
+  }
+  virtual ~NNet() {
+    mshadow::SetDevice<xpu>(devid);
+    mshadow::DeleteStream(stream);
+  }
+  // forward propagation
+  virtual void Forward(const Tensor<cpu, 2, real_t> &inbatch,
+                       Tensor<cpu, 2, real_t> &oubatch) {
+    // size is same convention as numpy
+    index_t batch_size = inbatch.size(0);
+    // copy data to input layer
+    Copy(ninput, inbatch, stream);
+    // wait for the last pull request on the layer to complete
+    ps->PullWait(0, devid);
+    // first layer, fullc
+    nhidden = dot(ninput, Wi2h);
+    // wait for the pull request on hbias to complete
+    ps->PullWait(1, devid);
+    nhidden += repmat(hbias, batch_size);
+    // activation, sigmoid, backup activation in nhidden
+    nhidden = F<sigmoid>(nhidden);
+    Copy(nhiddenbak, nhidden, stream);
+    // second layer fullc
+    ps->PullWait(2, devid);
+    nout = dot(nhiddenbak, Wh2o);
+    ps->PullWait(3, devid);
+    nout += repmat(obias, batch_size);
+    // softmax calculation
+    Softmax(nout, nout);
+    // copy result out
+    Copy(oubatch, nout, stream);
+    // Copy with stream is non-blocking, use wait to wait until copy finishes
+    stream->Wait();
+  }
+  // back propagation
+  virtual void Backprop(const Tensor<cpu, 2, real_t> &gradout) {
+    // copy gradient to output layer
+    Copy(nout, gradout, stream);
+    // calc grad of layer 2
+    g_obias = sum_rows(nout);
+    // sync proc defines the synchronization step
+    this->SyncProc(obias, g_obias, 3);
+    // update second layer weights
+    g_Wh2o = dot(nhiddenbak.T(), nout);
+    // backprop to layer 1
+    nhiddenbak = dot(nout, Wh2o.T());
+    this->SyncProc(Wh2o, g_Wh2o, 2);
+    // calculate gradient of sigmoid layer
+    nhidden = nhidden * (1.0f - nhidden) * nhiddenbak;
+    // calc grad of layer 1
+    g_hbias = sum_rows(nhidden);
+    this->SyncProc(hbias, g_hbias, 1);
+    g_Wi2h = dot(ninput.T(), nhidden);
+    this->SyncProc(Wi2h, g_Wi2h, 0);
+  }
+  // synchronization function
+  template<int dim>
+  inline void SyncProc(mshadow::Tensor<xpu, dim> weight,
+                       mshadow::Tensor<xpu, dim> grad,
+                       int data_key) {
+    // wait till last computation finishes
+    stream->Wait();
+    ps->Push(grad, data_key, devid, -data_key);
+    ps->PullReq(grad, data_key, devid, -data_key,
+                UpdateEntry::ApplyUpdate,
+                new UpdateEntry(weight.FlatTo2D(), grad.FlatTo2D(), dim == 1));
+  }
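+  // note: the fourth argument (-data_key) to Push/PullReq above is a priority
+  // hint; see the declarations in mshadow-ps/ps.h for its exact semantics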
+                UpdateEntry::ApplyUpdate,
+                new UpdateEntry(weight.FlatTo2D(), grad.FlatTo2D(), dim == 1));
+  }
+  // data structure defined to help use the callback function
+  struct UpdateEntry {
+    mshadow::Tensor<xpu, 2, real_t> weight;
+    mshadow::Tensor<xpu, 2, real_t> grad;
+    bool is_bias;
+    // constructor
+    UpdateEntry(mshadow::Tensor<xpu, 2, real_t> weight,
+                mshadow::Tensor<xpu, 2, real_t> grad,
+                bool is_bias)
+        : weight(weight), grad(grad),
+          is_bias(is_bias) {}
+    inline void Update(mshadow::Stream<xpu> *stream) {
+      weight.set_stream(stream);
+      const float wd = 0.00001;
+      const float eta = 0.8;
+      if (!is_bias) {
+        weight -= eta * (wd * weight + grad);
+      } else {
+        weight -= eta * grad;
+      }
+    }
+    // callback function to apply the update
+    inline static void ApplyUpdate(mshadow::Stream<xpu> *stream, void *arg) {
+      UpdateEntry *e = static_cast<UpdateEntry*>(arg);
+      e->Update(stream);
+      delete e;
+    }
+  };
+
+ private:
+  // computing stream
+  mshadow::Stream<xpu> *stream;
+  // device id
+  int devid;
+  // parameter server interface
+  mshadow::ps::ISharedModel<xpu, real_t> *ps;
+  // random number generator
+  Random<xpu, real_t> rnd;
+  // nodes in the neural net
+  TensorContainer<xpu, 2, real_t> ninput, nhidden, nhiddenbak, nout;
+  // bias, gradient
+  TensorContainer<xpu, 1, real_t> hbias, obias, g_hbias, g_obias;
+  // weight, gradient
+  TensorContainer<xpu, 2, real_t> Wi2h, Wh2o, g_Wi2h, g_Wh2o;
+};
+
+// helper function to get the max index
+inline int MaxIndex(Tensor<cpu, 1, real_t> pred) {
+  int maxidx = 0;
+  for (index_t i = 1; i < pred.size(0); ++i) {
+    if (pred[i] > pred[maxidx]) maxidx = (int)i;
+  }
+  return maxidx;
+}
+
+namespace mshadow {
+namespace ps {
+// the model updater is used when the update happens on the server side
+// if we only use the parameter server for sum aggregation
+// this is not needed, but we must declare this function to return NULL
+template<>
+IModelUpdater<real_t> *CreateModelUpdater(void) {
+  return NULL;
+}
+}
+}
+
+template<typename xpu>
+inline int Run(int argc, char *argv[]) {
+  srand(0);
+  // settings
+  int batch_size = 100;
+  int num_in = 28 * 28;
+  int num_hidden = 100;
+  int num_out = 10;
+  int ndev = argc - 2;
+  if (batch_size % ndev != 0) {
+    fprintf(stderr, "choose a number of devices ndev such that 100 MOD ndev == 0\n");
+    return 0;
+  }
+  // choose which devices to use
+  std::vector<int> devs;
+  for (int i = 2; i < argc; ++i) {
+    devs.push_back(atoi(argv[i]));
+  }
+  mshadow::ps::ISharedModel<xpu, real_t>
+      *ps = mshadow::ps::CreateSharedModel<xpu, real_t>("local");
+  ps->Init(devs);
+
+  std::vector<INNet *> nets(ndev);
+  for (int i = 0; i < ndev; ++i) {
+    mshadow::InitTensorEngine<xpu>(devs[i]);
+    nets[i] = new NNet<xpu>(batch_size / ndev, num_in, num_hidden, num_out, devs[i], ps);
+  }
+
+  // labels
+  std::vector<int> ytrain, ytest;
+  // data
+  TensorContainer<cpu, 2, real_t> xtrain, xtest;
+  LoadMNIST("train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain, true);
+  LoadMNIST("t10k-images-idx3-ubyte", "t10k-labels-idx1-ubyte", ytest, xtest, false);
+  int num_iter = 20;
+
+  for (int i = 0; i < num_iter; ++i) {
+    // mini-batch per device
+    int step = batch_size / ndev;
+    // running parallel threads
+    #pragma omp parallel num_threads(ndev)
+    {
+      // temp output layer
+      TensorContainer<cpu, 2, real_t> pred;
+      pred.Resize(Shape2(step, num_out));
+      int tid = omp_get_thread_num();
+      mshadow::SetDevice<xpu>(devs[tid]);
+      for (index_t j = 0; j + batch_size <= xtrain.size(0); j += batch_size) {
+        nets[tid]->Forward(xtrain.Slice(j + tid * step, j + (tid + 1) * step), pred);
+        // set gradient into pred
+        for (int k = 0; k < step; ++k) {
+          pred[k][ytrain[j + tid * step + k]] -= 1.0f;
+        }
+        // scale gradient by batch size
+        pred *= 1.0f / batch_size;
+        // run backprop
+        nets[tid]->Backprop(pred);
+      }
+    }
+    // evaluation
+    long nerr = 0;
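+    // each thread evaluates its own slice of the test set;
+    // the per-thread error counts are combined by the omp reduction clause below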
+    #pragma omp parallel num_threads(ndev) reduction(+:nerr)
+    {
+      // temp output layer
+      TensorContainer<cpu, 2, real_t> pred;
+      pred.Resize(Shape2(step, num_out));
+      int tid = omp_get_thread_num();
+      mshadow::SetDevice<xpu>(devs[tid]);
+      for (index_t j = 0; j + batch_size <= xtest.size(0); j += batch_size) {
+        nets[tid]->Forward(xtest.Slice(j + tid * step, j + (tid + 1) * step), pred);
+        for (int k = 0; k < step; ++k) {
+          nerr += MaxIndex(pred[k]) != ytest[j + tid * step + k];
+        }
+      }
+    }
+    printf("round %d: test-err=%f\n", i, (float)nerr/xtest.size(0));
+  }
+
+  for (int i = 0; i < ndev; ++i) {
+    mshadow::SetDevice<xpu>(devs[i]);
+    delete nets[i];
+    ShutdownTensorEngine<xpu>();
+  }
+  return 0;
+}
+int main(int argc, char *argv[]) {
+  if (argc < 3) {
+    printf("Usage: <cpu|gpu> devicelist\n"\
+           "\tExample1: ./nnet_ps cpu 1 2 3\n"\
+           "\tExample2: ./nnet_ps gpu 0 1\n");
+    return 0;
+  }
+  if (!strcmp(argv[1], "cpu")) {
+    Run<mshadow::cpu>(argc, argv);
+  } else {
+    Run<mshadow::gpu>(argc, argv);
+  }
+  return 0;
+}
diff --git a/guide/neuralnet/run.sh b/guide/neuralnet/run.sh
deleted file mode 100644
index 8b137891791f..000000000000
--- a/guide/neuralnet/run.sh
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/mshadow-ps/example/local_sum-inl.h b/mshadow-ps/example/local_sum-inl.h
index 4876cf9baafa..e731b2e9a01a 100644
--- a/mshadow-ps/example/local_sum-inl.h
+++ b/mshadow-ps/example/local_sum-inl.h
@@ -67,6 +67,7 @@ inline void RunWorkerThread(int devid,
   // print normally since Copy will block
   Print(data);
   printf("====================\n");
+  mshadow::DeleteStream(stream);
 }
 
 namespace mshadow {
@@ -86,7 +87,7 @@ inline int Run(int argc, char *argv[]) {
   if (argc < 2) {
     printf("Usage: device list\n"\
            "\tfor CPU the device list can be arbitrary\n"\
-           "\tfor GPU the device list can be arbitrary\n");
+           "\tfor GPU the device list need to be actual device index\n");
     return 0;
   }
   // list of device ids
diff --git a/mshadow/tensor.h b/mshadow/tensor.h
index eb8c5a04ea8a..619d66758729 100644
--- a/mshadow/tensor.h
+++ b/mshadow/tensor.h
@@ -434,23 +434,29 @@ struct Tensor:
  * this function should be called before all GPU tensor operations,
  * for using tensors in CPU, this call is actually not needed
  * \param device_id GPU device id to be chosen
+ * \tparam Device the device type
  */
+template<typename Device>
 inline void InitTensorEngine(int device_id = 0);
 /*!
- * \brief Shutdown tensor engine,
- * this function should be called after all GPU tensor operations,
- * for using tensors in CPU, this call is actually not needed
+ * \brief Shutdown tensor engine on current device
+ * this function should be called after all GPU tensor operations,
+ * for using tensors in CPU, this call is actually not needed
+ * \tparam Device the device type
  */
+template<typename Device>
 inline void ShutdownTensorEngine(void);
 /*!
  * \brief set the device of current thread to work on
  * \param devid the device id
+ * \tparam Device the device type
  */
 template<typename Device>
 inline void SetDevice(int devid);
 /*!
 * \brief create a new stream from system
 * \return a pointer to the created stream
+ * \tparam Device the device type
 */
 template<typename Device>
 inline Stream<Device> *NewStream(void);
diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h
index c3ddbe107d3f..240c65faffd6 100644
--- a/mshadow/tensor_cpu-inl.h
+++ b/mshadow/tensor_cpu-inl.h
@@ -12,6 +12,13 @@
 #include "./sse-inl.h"
 namespace mshadow {
+template<>
+inline void InitTensorEngine<cpu>(int dev_id) {
+}
+template<>
+inline void ShutdownTensorEngine<cpu>(void) {
+}
+
 template<>
 inline void SetDevice<cpu>(int devid) {
 }
diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h
index 9c2eb48ccf52..ffd203d33a1a 100644
--- a/mshadow/tensor_gpu-inl.h
+++ b/mshadow/tensor_gpu-inl.h
@@ -10,20 +10,9 @@
 #include "./tensor.h"
 namespace mshadow {
-#if !(MSHADOW_USE_CUDA)
-// do nothing if no GPU operation is involved
-inline void InitTensorEngine(int dev_id) {
-}
-inline void ShutdownTensorEngine(void) {
-}
-#else
-#if (MSHADOW_USE_NVML)
-inline int AutoSelectDevice(int device_count) {
-  // TODO(bing): nvml device id and cuda device id are not consistent
-  return 0;
-}
-#endif
-inline void InitTensorEngine(int dev_id) {
+#if MSHADOW_USE_CUDA
+template<>
+inline void InitTensorEngine<gpu>(int dev_id) {
   cudaDeviceProp prop;
   int device_id = 0;
   int device_count = 0;
@@ -31,9 +20,7 @@
   utils::Check(device_count > 0,
               "Cannot find CUDA device. Please check CUDA-Configuration");
   if (dev_id < 0) {
-#if (MSHADOW_USE_NVML)
-    device_id = AutoSelectDevice(device_count);
-#endif
+    device_id = 0;
   } else {
     device_id = dev_id;
   }
@@ -43,7 +30,8 @@
   printf("Use CUDA Device %d: %s\n", device_id, prop.name);
   cublasInit();
 }
-inline void ShutdownTensorEngine(void) {
+template<>
+inline void ShutdownTensorEngine<gpu>(void) {
   cublasShutdown();
 }
 template<>
From 77c75a6a4fba9f41421217bfc1bce6c531915de8 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Fri, 13 Feb 2015 20:43:37 -0800
Subject: [PATCH 144/147] add mshadow-ps intro

---
 guide/README.md                  |   9 +-
 guide/mshadow-ps/Makefile        |  36 +++++++
 guide/mshadow-ps/README.md       | 173 +++++++++++++++++++++++++++++++
 guide/mshadow-ps/config.mk       |  35 +++++++
 guide/mshadow-ps/local_sum-inl.h | 113 ++++++++++++++++++++
 guide/mshadow-ps/local_sum.cpp   |   4 +
 guide/mshadow-ps/local_sum.cu    |   4 +
 guide/neuralnet/nnet_ps.cu       |   8 +-
 8 files changed, 376 insertions(+), 6 deletions(-)
 create mode 100644 guide/mshadow-ps/Makefile
 create mode 100644 guide/mshadow-ps/README.md
 create mode 100644 guide/mshadow-ps/config.mk
 create mode 100644 guide/mshadow-ps/local_sum-inl.h
 create mode 100644 guide/mshadow-ps/local_sum.cpp
 create mode 100644 guide/mshadow-ps/local_sum.cu

diff --git a/guide/README.md b/guide/README.md
index 84573585eed9..ee36ca789208 100644
--- a/guide/README.md
+++ b/guide/README.md
@@ -3,8 +3,9 @@ Tutorial of mshadow
 This is a beginner's tutorial of mshadow. If you like mshadow and have ideas to improve this tutorial, you are more than welcome:) Please send a pull-request if you would like to share your experience.
-
-See also [Expression Template Tutorial](exp-template)
+See also other related materials about mshadow
+* [Expression Template Tutorial](exp-template)
+* [Writing Multi-GPU and Distributed ML](mshadow-ps)
 
 **List of Topics**
 * [Tensor Data Structure](#tensor-data-structure)
@@ -184,7 +185,7 @@ using namespace mshadow::expr;
 int main(void) {
   // initialize tensor engine before using tensor operation, needed for CuBLAS
-  InitTensorEngine();
+  InitTensorEngine<cpu>();
   // assume we have a float space
   float data[20];
   // create a 2 x 5 x 2 tensor, from existing space
@@ -213,7 +214,7 @@ int main(void) {
     printf("\n");
   }
   // shutdown tensor engine after usage
-  ShutdownTensorEngine();
+  ShutdownTensorEngine<cpu>();
   return 0;
 }
 ```
diff --git a/guide/mshadow-ps/Makefile b/guide/mshadow-ps/Makefile
new file mode 100644
index 000000000000..70cb724248f0
--- /dev/null
+++ b/guide/mshadow-ps/Makefile
@@ -0,0 +1,36 @@
+# set LD_LIBRARY_PATH
+export CC  = gcc
+export CXX = g++
+export NVCC = nvcc
+include config.mk
+include ../../make/mshadow.mk
+export CFLAGS = -Wall -O3 -fopenmp -I../../ $(MSHADOW_CFLAGS)
+export LDFLAGS = -lm $(MSHADOW_LDFLAGS)
+export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
+
+# specify tensor path
+BIN = local_sum.cpu
+OBJ =
+CUOBJ =
+CUBIN = local_sum.gpu
+.PHONY: clean all
+
+all: $(BIN) $(CUBIN)
+
+local_sum.cpu: local_sum.cpp
+local_sum.gpu: local_sum.cu
+
+$(BIN) :
+	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS)
+
+$(OBJ) :
+	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
+
+$(CUOBJ) :
+	$(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $(filter %.cu, $^)
+
+$(CUBIN) :
+	$(NVCC) -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -Xlinker "$(LDFLAGS)" $(filter %.cu %.cpp %.o, $^)
+
+clean:
+	$(RM) $(OBJ) $(BIN) $(CUBIN) $(CUOBJ) *~
diff --git a/guide/mshadow-ps/README.md b/guide/mshadow-ps/README.md
new file mode 100644
index 000000000000..263935a28700
--- /dev/null
+++ b/guide/mshadow-ps/README.md
@@ -0,0 +1,173 @@
+mshadow-ps
+====
+### Parameter Server Interface for GPU Tensor
+
+mshadow-ps provides an asynchronous parameter server interface for mshadow GPU/CPU Tensors.
+This allows you to do ***multi-GPU*** and ***distributed*** (deep) learning in
+an ***easy*** and ***unified*** way.
+
+#### List of Resources
+* [Library Interface Header](../../mshadow-ps/ps.h)
+* Tutorial in this page
+
+Tutorial
+====
+Suppose that we are implementing a Multi-GPU learning program.
+One way to do that is through data parallelism: we can launch many
+threads, with each thread computing the gradient on one GPU, and aggregate
+the statistics together.
+However, the gradient synchronization step can be time-consuming, and in
+many cases we can organize the computation in a smarter way, so that
+we ***overlap the computation with the synchronization***.
+
+mshadow-ps provides an interface to do such synchronization in an easy way.
+The following sections walk through the interface step by step.
+
+### Getting Sum from Multiple GPUs
+We first get familiar with the interface of mshadow-ps through the following
+program in [local_sum-inl.h](local_sum-inl.h). You can compile the program
+by setting up [config.mk](config.mk) according to your computer's environment, and typing make.
+
+In the following program, each thread first does some computation locally, then tries to get the sum
+of ```data``` through the mshadow-ps interface.
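+
+Before diving into the full listing, the essential per-thread call pattern is condensed
+in the sketch below. This is only a sketch, not a complete program: the tensor, the key
+and the device id are illustrative placeholders taken from the full, runnable version
+that follows right after.
+```c++
+// condensed sketch of the per-thread synchronization pattern
+// assume: ps is an ISharedModel<xpu, float>*, data a device tensor, devid the device id
+ps->InitKey(data[0].shape_, 0, devid);  // register the shape for key 0 (done once)
+ps->Push(data[0], 0, devid);            // asynchronous: enqueue local data for aggregation
+ps->PullReq(data[0], 0, devid);         // asynchronous: request the aggregated result back
+// ... other computation can overlap with the synchronization here ...
+ps->PullWait(0, devid);                 // block until the pull for key 0 completes
+```
+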
+There are four key functions in the ```ISharedModel``` interface:
+* [InitKey](../../mshadow-ps/ps.h#L76) binds a key to a specific tensor shape
+* [Push](../../mshadow-ps/ps.h#L100) pushes the local data out to the synchronization interface
+  - The data pushed by different devices will be aggregated together by key
+  - Push is an asynchronous call and returns immediately
+* [PullReq](../../mshadow-ps/ps.h#L122) requests the result of the synchronization to be copied back
+  - In the default local case, the synchronized result is the sum of the pushed data
+  - mshadow-ps also supports weight updates on the server side, in which case the result of PullReq is the updated weight instead of the sum of gradients
+  - PullReq is also asynchronous
+* [PullWait](../../mshadow-ps/ps.h#L87) waits until the pull request of the corresponding key finishes
+
+```c++
+// this function is run by a specific thread for each device
+template<typename xpu>
+inline void RunWorkerThread(int devid,
+                            mshadow::ps::ISharedModel<xpu, float> *ps) {
+  // initialize tensor engine
+  mshadow::InitTensorEngine<xpu>(devid);
+  mshadow::Stream<xpu> *stream = mshadow::NewStream<xpu>();
+  // allocate tensor on xpu
+  mshadow::TensorContainer<xpu, 2, float> data(mshadow::Shape2(2, 3));
+  // set the computation stream to the newly allocated stream
+  // this makes subsequent computations whose target is data use the stream;
+  // a stream is needed for async execution on GPU
+  data.set_stream(stream);
+  // assume these operations set the content of data
+  data[0] = 1.0f;
+  data[1] = devid + data[0];
+  printf("dev%d: before sync, data:\n", devid);
+  // use Print to show the result; do not call
+  // Print in performance-critical code since the Copy inside will block
+  Print(data);
+  printf("====================\n");
+  // initialize the keys, register the shapes on the parameter server
+  ps->InitKey(data[0].shape_, 0, devid);
+  ps->InitKey(data[1].shape_, 1, devid);
+  // push data[0] out, for update, or aggregation
+  // 0 is the key of the data, devid is the current device id
+  ps->Push(data[0], 0, devid);
+  // a pull request is used to request the data to be copied back
+  // once the synchronization is done
+  ps->PullReq(data[0], 0, devid);
+  // computation can be done here..
+  // the pull request will be handled in the background
+  // similar as the previous call
+  ps->Push(data[1], 1, devid);
+  ps->PullReq(data[1], 1, devid);
+  // more computation can be done here...
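+  // note: Push and PullReq only enqueue requests for the background threads;
+  // nothing up to this point has blocked on communication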
+  // the computation will be overlapped with the synchronization
+  // PullWait will block until these requests finish
+  ps->PullWait(0, devid);
+  ps->PullWait(1, devid);
+  printf("dev%d: after sync, data:\n", devid);
+  // use Print to show the result; do not call
+  // Print in performance-critical code since the Copy inside will block
+  Print(data);
+  printf("====================\n");
+  mshadow::DeleteStream(stream);
+  mshadow::ShutdownTensorEngine<xpu>();
+}
+
+template<typename xpu>
+inline int Run(int argc, char *argv[]) {
+  if (argc < 2) {
+    printf("Usage: device list\n"\
+           "\tfor CPU the device list can be arbitrary\n"\
+           "\tfor GPU the device list needs to be the actual device indices\n");
+    return 0;
+  }
+  // list of device ids
+  std::vector<int> devs;
+  // initialization
+  for (int i = 1; i < argc; ++i) {
+    // record the device id
+    devs.push_back(atoi(argv[i]));
+  }
+  mshadow::ps::ISharedModel<xpu, float>
+      *ps = mshadow::ps::CreateSharedModel<xpu, float>("local");
+  // initialize the ps
+  ps->Init(devs);
+  // use openmp to launch #devs threads
+  #pragma omp parallel num_threads(devs.size())
+  {
+    int tid = omp_get_thread_num();
+    RunWorkerThread<xpu>(devs[tid], ps);
+  }
+  delete ps;
+  return 0;
+}
+```
+In the above example, we did not do the weight update on the server side, so the synchronization result is
+simply the sum of the data on each device. The key property of this interface is that Push and PullReq are asynchronous.
+* We can call these two functions as soon as the gradient is ready, and mshadow-ps will do the data synchronization in the background.
+* When we need the result of the synchronization, we simply call PullWait to wait for the synchronization task to finish.
+* This interface allows us to do additional computation between Push/PullReq and PullWait.
+
+### A MultiGPU Neural Net
+To get a more concrete understanding of the interface, we give an example of a multi-GPU two layer neural net
+in [../neuralnet/nnet_ps.cu](../neuralnet/nnet_ps.cu). The general idea is as follows:
+* Push and PullReq are called as soon as we get the gradient of a certain layer
+* PullWait is called before we do the next forward pass on that layer
+* This creates a ***time lag*** between the backprop and the next forward pass of that layer
+  - mshadow-ps does the synchronization concurrently with the computation during the time lag
+  - The time lag is big for the latter layers, which usually also need more time to synchronize
+
+There are several notes on mshadow-ps in the neural net code:
+* Callback function in PullReq
+  - A callback function can be passed to PullReq, to be called when the request completes
+  - We place the weight update in the callback to perform the update as soon as we get the gradient sum
+* Computing stream
+  - Due to GPU's programming model, we need to do computation on a non-default stream
+  - Use set_stream on mshadow tensors to set the computation stream
+  - To get an error report when you forget to set a stream, compile with -DMSHADOW_FORCE_STREAM
+
+We should note that because the example runs on MNIST, which is a rather small dataset, you may not observe
+a speedup with multiple cards. However, you will find significant speedups when you run on larger tasks.
+The newest version of [cxxnet](https://github.com/antinucleon/cxxnet) builds on mshadow-ps.
+
+### Moving Parameter Update to the Server
+In all the examples so far, we use mshadow-ps to get the aggregated sum of gradients, and update the
+weights locally on each GPU. For more advanced usage of mshadow-ps, we can move the weight update
+to the server.
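+
+As a preview of how the worker-side code changes in this mode, consider the following
+sketch. It is illustrative only: the tensors ```W``` and ```g_W``` and the key are
+hypothetical placeholders, and the updater itself is defined as described in the rest
+of this section.
+```c++
+// sketch: worker-side flow when the update happens on the server
+ps->SetParam("update_on_server", "1");  // must be set before ps->Init(devs)
+ps->Init(devs);
+// ... backprop produces gradient g_W for key 0 on device devid ...
+ps->Push(g_W, 0, devid);    // push the gradient to the server
+ps->PullReq(W, 0, devid);   // the pulled result is the updated weight, not the sum
+ps->PullWait(0, devid);     // after this, W holds the new weight
+```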
The communication pattern is as follows:
+* Each thread still calls Push to push out the gradient
+* The server applies the update rule to update the weight
+* Each thread calls PullReq to pull the weight back from the server
+
+Such an update pattern is suitable in the distributed setting. To do so, the user needs to implement the
+[IModelUpdater](../../mshadow-ps/ps.h#L202) interface, and define the following CreateModelUpdater function
+in the program:
+```c++
+namespace mshadow {
+namespace ps {
+template<>
+IModelUpdater<float> *CreateModelUpdater() {
+  return new MyModelUpdater();
+}
+}
+}
+```
+Before calling ISharedModel::Init, the user needs to call ```ps->SetParam("update_on_server", "1")``` to set the update
+mode on the server side. When using the distributed shared model, the user must define a ModelUpdater.
diff --git a/guide/mshadow-ps/config.mk b/guide/mshadow-ps/config.mk
new file mode 100644
index 000000000000..0297304df6de
--- /dev/null
+++ b/guide/mshadow-ps/config.mk
@@ -0,0 +1,35 @@
+#---------------------------------------------------------------------------------------
+#  mshadow: the configuration compile script
+#
+#  This is the configuration script that you can use to compile mshadow
+#  Usage:
+#
+#  include config.mk in your Makefile, or directly include the definition of variables
+#  include mshadow.mk after the variables are set
+#
+#  Add MSHADOW_CFLAGS to the compile flags
+#  Add MSHADOW_LDFLAGS to the linker flags
+#  Add MSHADOW_NVCCFLAGS to the nvcc compile flags
+#----------------------------------------------------------------------------------------
+
+# whether to use CUDA during compile
+USE_CUDA = 0
+
+# add the path to the CUDA library for the link and compile flags
+# if you have already added it to the environment variable, leave it as NONE
+USE_CUDA_PATH = NONE
+
+#
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas, apple
+USE_BLAS = blas
+#
+# add the path to the intel library, you may need it
+# for MKL, if you did not add the path to the environment variable
+#
+USE_INTEL_PATH = NONE
+
+# whether to compile with the parameter server
+USE_DIST_PS = 0
+PS_PATH = NONE
+PS_THIRD_PATH = NONE
diff --git a/guide/mshadow-ps/local_sum-inl.h b/guide/mshadow-ps/local_sum-inl.h
new file mode 100644
index 000000000000..5120590a2768
--- /dev/null
+++ b/guide/mshadow-ps/local_sum-inl.h
@@ -0,0 +1,113 @@
+// This is an example demonstrating the usage of mshadow-ps
+#include <mshadow/tensor.h>
+// use openmp to launch multiple threads
+#include <omp.h>
+#include <mshadow-ps/ps.h>
+#include <vector>
+
+// simple util to print the result
+void Print_(mshadow::Tensor<mshadow::cpu, 2, float> ts) {
+  for (mshadow::index_t i = 0; i < ts.size(0); ++i) {
+    for (mshadow::index_t j = 0; j < ts.size(1); ++j) {
+      printf("%g ", ts[i][j]);
+    }
+    printf("\n");
+  }
+}
+template<typename xpu>
+inline void Print(mshadow::Tensor<xpu, 2, float> ts) {
+  mshadow::TensorContainer<mshadow::cpu, 2, float> tmp;
+  tmp.Resize(ts.shape_);
+  mshadow::Copy(tmp, ts);
+  Print_(tmp);
+}
+
+// this function is run by a specific thread for each device
+template<typename xpu>
+inline void RunWorkerThread(int devid,
+                            mshadow::ps::ISharedModel<xpu, float> *ps) {
+  // initialize tensor engine
+  mshadow::InitTensorEngine<xpu>(devid);
+  mshadow::Stream<xpu> *stream = mshadow::NewStream<xpu>();
+  // allocate tensor on xpu
+  mshadow::TensorContainer<xpu, 2, float> data(mshadow::Shape2(2, 3));
+  // set the computation stream to the newly allocated stream
+  // this makes subsequent computations whose target is data use the stream;
+  // a stream is needed for async execution on GPU
+  data.set_stream(stream);
+  // assume these operations set the content of data
+  data[0] = 1.0f;
+  data[1] = devid + data[0];
+  printf("dev%d: before sync, data:\n", devid);
+  // use Print to show the result; do not call
+  // Print in performance-critical code since the Copy inside will block
+  Print(data);
+  printf("====================\n");
+  // initialize the keys, register the shapes on the parameter server
+  ps->InitKey(data[0].shape_, 0, devid);
+  ps->InitKey(data[1].shape_, 1, devid);
+  // push data[0] out, for update, or aggregation
+  // 0 is the key of the data, devid is the current device id
+  ps->Push(data[0], 0, devid);
+  // a pull request is used to request the data to be copied back
+  // once the synchronization is done
+  ps->PullReq(data[0], 0, devid);
+  // computation can be done here..
+  // the pull request will be handled in the background
+  // similar as the previous call
+  ps->Push(data[1], 1, devid);
+  ps->PullReq(data[1], 1, devid);
+  // more computation can be done here...
+  // the computation will be overlapped with the synchronization
+  // PullWait will block until these requests finish
+  ps->PullWait(0, devid);
+  ps->PullWait(1, devid);
+  printf("dev%d: after sync, data:\n", devid);
+  // use Print to show the result; do not call
+  // Print in performance-critical code since the Copy inside will block
+  Print(data);
+  printf("====================\n");
+  mshadow::DeleteStream(stream);
+  mshadow::ShutdownTensorEngine<xpu>();
+}
+
+namespace mshadow {
+namespace ps {
+// the model updater is used when the update happens on the server side
+// if we only use the parameter server for sum aggregation
+// this is not needed, but we must declare this function to return NULL
+template<>
+IModelUpdater<float> *CreateModelUpdater(void) {
+  return NULL;
+}
+}
+}
+
+template<typename xpu>
+inline int Run(int argc, char *argv[]) {
+  if (argc < 2) {
+    printf("Usage: device list\n"\
+           "\tfor CPU the device list can be arbitrary\n"\
+           "\tfor GPU the device list needs to be the actual device indices\n");
+    return 0;
+  }
+  // list of device ids
+  std::vector<int> devs;
+  // initialization
+  for (int i = 1; i < argc; ++i) {
+    // record the device id
+    devs.push_back(atoi(argv[i]));
+  }
+  mshadow::ps::ISharedModel<xpu, float>
+      *ps = mshadow::ps::CreateSharedModel<xpu, float>("local");
+  // initialize the ps
+  ps->Init(devs);
+  // use openmp to launch #devs threads
+  #pragma omp parallel num_threads(devs.size())
+  {
+    int tid = omp_get_thread_num();
+    RunWorkerThread<xpu>(devs[tid], ps);
+  }
+  delete ps;
+  return 0;
+}
diff --git a/guide/mshadow-ps/local_sum.cpp b/guide/mshadow-ps/local_sum.cpp
new file mode 100644
index 000000000000..7f0eed0df42e
--- /dev/null
+++ b/guide/mshadow-ps/local_sum.cpp
@@ -0,0 +1,4 @@
+#include "./local_sum-inl.h"
+int main(int argc, char *argv[]) {
+  return Run<mshadow::cpu>(argc, argv);
+}
diff --git a/guide/mshadow-ps/local_sum.cu b/guide/mshadow-ps/local_sum.cu
new file mode 100644
index 000000000000..6e839601a265
--- /dev/null
+++ b/guide/mshadow-ps/local_sum.cu
@@ -0,0 +1,4 @@
+#include "./local_sum-inl.h"
+int main(int argc, char *argv[]) {
+  return Run<mshadow::gpu>(argc, argv);
+}
diff --git a/guide/neuralnet/nnet_ps.cu b/guide/neuralnet/nnet_ps.cu
index 2a5dd828afbf..996bbe266d7b 100644
--- a/guide/neuralnet/nnet_ps.cu
+++ b/guide/neuralnet/nnet_ps.cu
@@ -1,4 +1,7 @@
-// this implements a simple two layer neural net
+// this implements a simple two layer Multi-GPU neural net
+// this implementation uses mshadow-ps to get gradient aggregation
+// between cards
+// this code is modified from nnet.cu
 #include <vector>
 #include <cmath>
 #include <omp.h>
@@ -22,7 +25,8 @@ struct sigmoid {
 /*! \brief interface for nnet, the interface allows us to use the GPU/CPU implementation in a unified way */
 class INNet{
  public:
-  virtual void Forward(const Tensor<cpu, 2, real_t>& inbatch, Tensor<cpu, 2, real_t> &oubatch) = 0;
+  virtual void Forward(const Tensor<cpu, 2, real_t>& inbatch,
+                       Tensor<cpu, 2, real_t> &oubatch) = 0;
   virtual void Backprop(const Tensor<cpu, 2, real_t>& gradout) = 0;
   virtual ~INNet() {}
 };
From bb796f124459d871b182b351a9f1dc1d1f2e7c8e Mon Sep 17 00:00:00 2001
From: tqchen
Date: Fri, 13 Feb 2015 20:48:02 -0800
Subject: [PATCH 145/147] ok

---
 README.md                          |  10 ++-
 doc/README.md                      |   1 +
 guide/mshadow-ps/config.mk         |   4 +-
 guide/neuralnet/README.md          |   9 ++-
 guide/neuralnet/config.mk          |   2 +-
 mshadow-ps/README.md               |  17 +----
 mshadow-ps/example/Makefile        |  36 ----------
 mshadow-ps/example/config.mk       |  35 ---------
 mshadow-ps/example/local_sum-inl.h | 112 -----------------------------
 mshadow-ps/example/local_sum.cpp   |   4 --
 mshadow-ps/example/local_sum.cu    |   4 --
 11 files changed, 21 insertions(+), 213 deletions(-)
 delete mode 100644 mshadow-ps/example/Makefile
 delete mode 100644 mshadow-ps/example/config.mk
 delete mode 100644 mshadow-ps/example/local_sum-inl.h
 delete mode 100644 mshadow-ps/example/local_sum.cpp
 delete mode 100644 mshadow-ps/example/local_sum.cu

diff --git a/README.md b/README.md
index 3310efc12cdb..a2db5af76264 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,12 @@ mshadow: Matrix Shadow
 MShadow is a lightweight CPU/GPU Matrix/Tensor Template Library in C++/CUDA. The goal of mshadow is to support an ***efficient***, ***device invariant*** and ***simple*** tensor library for machine learning projects that aim for both simplicity and performance.
 
+MShadow also provides an interface that allows writing Multi-GPU and distributed deep learning programs in an easy and unified way.
+
 * [Contributors](https://github.com/tqchen/mshadow/graphs/contributors)
 * [Tutorial](guide)
 * [Documentation](doc)
+* [Parameter Server Interface for GPU Tensor](guide/mshadow-ps)
 
 Features
 =====
@@ -18,8 +21,11 @@ Features
 * Whitebox: put a float* into the Tensor struct and take the benefit of the package, no memory allocation happens unless explicitly requested
 * Lightweight library: light amount of code to support frequently used functions in machine learning
 * Extendable: user can write simple functions that plug into mshadow and run on GPU/CPU, no experience in CUDA is required.
-
+* MultiGPU and Distributed ML: the mshadow-ps interface allows users to write efficient MultiGPU and distributed programs in a unified way.
 
 Related Projects
 =====
-* CXXNET: neural network implementation based on mshadow: https://github.com/antinucleon/cxxnet
+* [CXXNET: large-scale deep learning backed by mshadow](https://github.com/antinucleon/cxxnet)
+* [Parameter Server](https://github.com/mli/parameter_server)
+  - The parameter server project provides the distributed back-end for mshadow-ps
+  - mshadow-ps extends the original parameter server to support asynchronous updates for GPU Tensors
diff --git a/doc/README.md b/doc/README.md
index 3502b3e5d351..03506565fcb7 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -8,6 +8,7 @@ This is the documentation for mshadow: A Lightweight CPU/GPU Matrix/Tensor Templ
 * API Documentation
   - You can run ```./mkdoc.sh``` to make the document locally
 * [Tutorial about Expression Template](../guide/exp-template)
+* [Writing Multi-GPU and Distributed ML](../guide/mshadow-ps)
 * [Compile Configuration script](../make)
 * [Expression API](#expression-api)
   - The Expression API introduces the concept of expression in mshadow
diff --git a/guide/mshadow-ps/config.mk b/guide/mshadow-ps/config.mk
index 0297304df6de..834b430c0f8c 100644
--- a/guide/mshadow-ps/config.mk
+++ b/guide/mshadow-ps/config.mk
@@ -13,7 +13,7 @@
 #----------------------------------------------------------------------------------------
 
 # whether to use CUDA during compile
-USE_CUDA = 0
+USE_CUDA = 1
 
 # add the path to the CUDA library for the link and compile flags
 # if you have already added it to the environment variable, leave it as NONE
@@ -22,7 +22,7 @@ USE_CUDA_PATH = NONE
 #
 # choose the version of blas you want to use
 # can be: mkl, blas, atlas, openblas, apple
-USE_BLAS = blas
+USE_BLAS = atlas
 #
 # add the path to the intel library, you may need it
 # for MKL, if you did not add the path to the environment variable
diff --git a/guide/neuralnet/README.md b/guide/neuralnet/README.md
index 519acc436ce3..dd181e758c65 100644
--- a/guide/neuralnet/README.md
+++ b/guide/neuralnet/README.md
@@ -1,4 +1,5 @@
-This folder contains a mshadow example of simple neural net implementation
+Example Neural Net code with MShadow
+====
 To compile the code, modify ```config.mk``` to the settings you like and type make
 * You will need to have CUDA and a version of BLAS
 
@@ -7,3 +8,9 @@ To run the demo, download the MNIST dataset from: http://yann.lecun.com/exdb/mnist/
 unzip all the files into the current folder and run
 ./nnet cpu or ./nnet gpu.
 ./convnet cpu or ./convnet gpu
+
+MultiGPU Version
+====
+* If you have two GPUs, you can run the example with ```./nnet_ps gpu 0 1```.
+* You can also run it using CPUs: ```./nnet_ps cpu 0 1```.
+* This is a demonstration of the mshadow-ps interface; see the introduction in [../mshadow-ps](../mshadow-ps)
diff --git a/guide/neuralnet/config.mk b/guide/neuralnet/config.mk
index 834b430c0f8c..112396d5557b 100644
--- a/guide/neuralnet/config.mk
+++ b/guide/neuralnet/config.mk
@@ -22,7 +22,7 @@ USE_CUDA_PATH = NONE
 #
 # choose the version of blas you want to use
 # can be: mkl, blas, atlas, openblas, apple
-USE_BLAS = atlas
+USE_BLAS = mkl
 #
 # add the path to the intel library, you may need it
 # for MKL, if you did not add the path to the environment variable
diff --git a/mshadow-ps/README.md b/mshadow-ps/README.md
index a9ceb7dca80d..9c90cc9f3c9d 100644
--- a/mshadow-ps/README.md
+++ b/mshadow-ps/README.md
@@ -1,19 +1,4 @@
 mshadow-ps
 ====
-### Parameter Server Interface for GPU Tensor
+This folder contains the mshadow-ps parameter server interface for mshadow GPU/CPU Tensors. See the [guide on mshadow-ps](../guide/mshadow-ps) for an introduction to the interface.
-mshadow-ps provides asynchronize parameter server interface for mshadow GPU/CPU Tensor. -This allows you to do ***multi-GPU*** and ***disrtibuted*** (deep) learning in -an ***easy*** and ***unified*** way. - -Introduction -==== - -The interface of mshadow-ps is [ps.h](ps.h), - - -#### Getting Sum from Multiple GPUs - - - -#### A MultiGPU Neural Net diff --git a/mshadow-ps/example/Makefile b/mshadow-ps/example/Makefile deleted file mode 100644 index 70cb724248f0..000000000000 --- a/mshadow-ps/example/Makefile +++ /dev/null @@ -1,36 +0,0 @@ -# set LD_LIBRARY_PATH -export CC = gcc -export CXX = g++ -export NVCC =nvcc -include config.mk -include ../../make/mshadow.mk -export CFLAGS = -Wall -O3 -fopenmp -I../../ $(MSHADOW_CFLAGS) -export LDFLAGS= -lm $(MSHADOW_LDFLAGS) -export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) - -# specify tensor path -BIN = local_sum.cpu -OBJ = -CUOBJ = -CUBIN = local_sum.gpu -.PHONY: clean all - -all: $(BIN) $(CUBIN) - -local_sum.cpu: local_sum.cpp -local_sum.gpu: local_sum.cu - -$(BIN) : - $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) - -$(OBJ) : - $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) - -$(CUOBJ) : - $(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $(filter %.cu, $^) - -$(CUBIN) : - $(NVCC) -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -Xlinker "$(LDFLAGS)" $(filter %.cu %.cpp %.o, $^) - -clean: - $(RM) $(OBJ) $(BIN) $(CUBIN) $(CUOBJ) *~ diff --git a/mshadow-ps/example/config.mk b/mshadow-ps/example/config.mk deleted file mode 100644 index 0297304df6de..000000000000 --- a/mshadow-ps/example/config.mk +++ /dev/null @@ -1,35 +0,0 @@ -#--------------------------------------------------------------------------------------- -# mshadow: the configuration compile script -# -# This is configuration script that you can use to compile mshadow -# Usage: -# -# include config.mk in your Makefile, or directly include the definition of variables -# include mshadow.mk after the variables are set -# -# Add MSHADOW_CFLAGS to the compile flags -# Add MSHADOW_LDFLAGS to the linker flags -# Add MSHADOW_NVCCFLAGS to the nvcc compile flags -#---------------------------------------------------------------------------------------- - -# whether use CUDA during compile -USE_CUDA = 0 - -# add the path to CUDA libary to link and compile flag -# if you have already add them to enviroment variable, leave it as NONE -USE_CUDA_PATH = NONE - -# -# choose the version of blas you want to use -# can be: mkl, blas, atlas, openblas, apple -USE_BLAS = blas -# -# add path to intel library, you may need it -# for MKL, if you did not add the path to enviroment variable -# -USE_INTEL_PATH = NONE - -# whether compile with parameter server -USE_DIST_PS = 0 -PS_PATH = NONE -PS_THIRD_PATH = NONE diff --git a/mshadow-ps/example/local_sum-inl.h b/mshadow-ps/example/local_sum-inl.h deleted file mode 100644 index e731b2e9a01a..000000000000 --- a/mshadow-ps/example/local_sum-inl.h +++ /dev/null @@ -1,112 +0,0 @@ -// This is an example demonstrating the usage of mshadow ps -#include -// use openmp to launch multiple threads -#include -#include -#include - -// simple util to print result -void Print_(mshadow::Tensor ts) { - for (mshadow::index_t i = 0; i < ts.size(0); ++i) { - for (mshadow::index_t j = 0; j < ts.size(1); ++j) { - printf("%g ", ts[i][j]); - } - printf("\n"); - } -} -template -inline void Print(mshadow::Tensor ts) { - mshadow::TensorContainer tmp; - tmp.Resize(ts.shape_); - mshadow::Copy(tmp, ts); - Print_(tmp); -} - -// this function is 
runed by specific thread -template -inline void RunWorkerThread(int devid, - mshadow::ps::ISharedModel *ps) { - // initialize tensor engine - mshadow::InitTensorEngine(devid); - mshadow::Stream *stream = mshadow::NewStream(); - // allocate tensor on xpu - mshadow::TensorContainer data(mshadow::Shape2(2, 3)); - // set the computation stream to the new allocated stream - // this will make subsequent computation whose target is data - // to use the stream, stream is needed for async execution in GPU - data.set_stream(stream); - // assume these operations sets the content of dataient - data[0] = 1.0f; - data[1] = devid + data[0]; - printf("dev%d: before sync, data:\n", devid); - // use print to show result, do not call - // print normally since Copy will block - Print(data); - printf("====================\n"); - // intiaialize the key, register the shape on parameter server - ps->InitKey(data[0].shape_, 0, devid); - ps->InitKey(data[1].shape_, 1, devid); - // push data[0] out, for update, or aggregation - // 0 is the key of the data, devid is the current device id - ps->Push(data[0], 0, devid); - // pull request is used to request the data to be copied back - // once computation is done - ps->PullReq(data[0], 0, devid); - // computation can be done here.. - // the pull request handler will be overlapped with - // similar as previous call - ps->Push(data[1], 1, devid); - ps->PullReq(data[1], 1, devid); - // more computation can be done here... - // the computation will be overlapped - // PullWait will block until these request finishes - ps->PullWait(0, devid); - ps->PullWait(1, devid); - printf("dev%d: after sync, data:\n", devid); - // use print to show result, do not call - // print normally since Copy will block - Print(data); - printf("====================\n"); - mshadow::DeleteStream(stream); -} - -namespace mshadow { -namespace ps { -// model updater is used when update is happening on server side -// if we only use parameter server for sum aggregation -// this is not needed, but we must declare this function to return NULL -template<> -IModelUpdater *CreateModelUpdater(void) { - return NULL; -} -} -} - -template -inline int Run(int argc, char *argv[]) { - if (argc < 2) { - printf("Usage: device list\n"\ - "\tfor CPU the device list can be arbitrary\n"\ - "\tfor GPU the device list need to be actual device index\n"); - return 0; - } - // list of device ids - std::vector devs; - // initialization - for (int i = 1; i < argc; ++i) { - // record the device id - devs.push_back(atoi(argv[i])); - } - mshadow::ps::ISharedModel - *ps = mshadow::ps::CreateSharedModel("local"); - // intiaialize the ps - ps->Init(devs); - // use openmp to launch #devs threads - #pragma omp parallel num_threads(devs.size()) - { - int tid = omp_get_thread_num(); - RunWorkerThread(devs[tid], ps); - } - delete ps; - return 0; -} diff --git a/mshadow-ps/example/local_sum.cpp b/mshadow-ps/example/local_sum.cpp deleted file mode 100644 index 7f0eed0df42e..000000000000 --- a/mshadow-ps/example/local_sum.cpp +++ /dev/null @@ -1,4 +0,0 @@ -#include "./local_sum-inl.h" -int main(int argc, char *argv[]) { - return Run(argc, argv); -} diff --git a/mshadow-ps/example/local_sum.cu b/mshadow-ps/example/local_sum.cu deleted file mode 100644 index 6e839601a265..000000000000 --- a/mshadow-ps/example/local_sum.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "./local_sum-inl.h" -int main(int argc, char *argv[]) { - return Run(argc, argv); -} From a9d340d303a9b7ae49754e5cda6ce4f168d1e37e Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 13 Feb 
2015 20:56:51 -0800
Subject: [PATCH 146/147] new version

---
 CHANGES.md | 12 ++++++++++++
 README.md  |  7 +++++++
 2 files changed, 19 insertions(+)
 create mode 100644 CHANGES.md

diff --git a/CHANGES.md b/CHANGES.md
new file mode 100644
index 000000000000..03bb16936acd
--- /dev/null
+++ b/CHANGES.md
@@ -0,0 +1,12 @@
+Change Log
+=====
+
+mshadow-1.0
+=====
+* Initial release
+
+mshadow-2.0: in progress
+=====
+* Support multiple data types
+* Great refactoring of the code
+* Parameter server interface for MultiGPU and distributed learning
diff --git a/README.md b/README.md
index a2db5af76264..c19420f7e4a3 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,13 @@ Features
 * Extendable: user can write simple functions that plug into mshadow and run on GPU/CPU, no experience in CUDA is required.
 * MultiGPU and Distributed ML: the mshadow-ps interface allows users to write efficient MultiGPU and distributed programs in a unified way.
 
+Version
+======
+* This version is mshadow-2.x; there are a lot of changes in the interface and it is not backward compatible with mshadow-1.0
+  - If you use an older version of cxxnet, you will need to use the legacy mshadow code
+* For the legacy code, refer to [here](https://github.com/tqchen/mshadow/releases/tag/v1.1)
+* Change log in [CHANGES.md](CHANGES.md)
+
 Related Projects
 =====
 * [CXXNET: large-scale deep learning backed by mshadow](https://github.com/antinucleon/cxxnet)
From 58fd51dc15a8a9547176d6ba7867d587f73cab0d Mon Sep 17 00:00:00 2001
From: tqchen
Date: Fri, 13 Feb 2015 21:35:01 -0800
Subject: [PATCH 147/147] doc changes

---
 doc/Doxyfile               |  6 ++---
 doc/README.md              |  2 +-
 doc/mkdoc.sh               |  6 ++---
 guide/mshadow-ps/README.md |  1 +
 mshadow-ps/ps.h            | 21 ++++++++++++++---
 mshadow/base.h             | 47 ++++++++++++++++++++++++--------------
 mshadow/expression.h       | 10 ++++++--
 mshadow/io.h               |  2 +-
 mshadow/random.h           |  4 ++++
 mshadow/tensor.h           | 12 ++++++----
 mshadow/tensor_container.h |  5 +++-
 11 files changed, 81 insertions(+), 35 deletions(-)

diff --git a/doc/Doxyfile b/doc/Doxyfile
index 38bd831fa338..f3cc429213c9 100644
--- a/doc/Doxyfile
+++ b/doc/Doxyfile
@@ -8,7 +8,7 @@ PROJECT_NAME = "mshadow"
 PROJECT_NUMBER =
 PROJECT_BRIEF =
 PROJECT_LOGO =
-OUTPUT_DIRECTORY = ../doc
+OUTPUT_DIRECTORY = doc
 CREATE_SUBDIRS = NO
 OUTPUT_LANGUAGE = English
 BRIEF_MEMBER_DESC = YES
@@ -95,13 +95,13 @@ WARN_LOGFILE =
 #---------------------------------------------------------------------------
 # configuration options related to the input files
 #---------------------------------------------------------------------------
-INPUT =
+INPUT = mshadow mshadow-ps
 INPUT_ENCODING = UTF-8
 FILE_PATTERNS =
 RECURSIVE = NO
 EXCLUDE =
 EXCLUDE_SYMLINKS = NO
-EXCLUDE_PATTERNS = *-inl.*
+EXCLUDE_PATTERNS = *-inl.* utils.h thread_util.h thread.h kv_array.h
 EXCLUDE_SYMBOLS = mshadow::expr::Plan* mshadow::expr::*Engine*
 EXAMPLE_PATH =
 EXAMPLE_PATTERNS =
diff --git a/doc/README.md b/doc/README.md
index 03506565fcb7..9ea6172f37a7 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -5,7 +5,7 @@ This is the documentation for mshadow: A Lightweight CPU/GPU Matrix/Tensor Templ
 
 ### Links to Topics
 * [Tutorial](../guide)
-* API Documentation
+* [API Documentation](http://homes.cs.washington.edu/~tqchen/mshadow/doc)
   - You can run ```./mkdoc.sh``` to make the document locally
 * [Tutorial about Expression Template](../guide/exp-template)
 * [Writing Multi-GPU and Distributed ML](../guide/mshadow-ps)
diff --git a/doc/mkdoc.sh b/doc/mkdoc.sh
index 2c4b038106c1..3ee3d71b8ce8 100755
--- a/doc/mkdoc.sh
+++ b/doc/mkdoc.sh
@@ -1,4 +1,4 @@
 #!/bin/bash
-cd
../mshadow -doxygen ../doc/Doxyfile -cd ../doc +cd .. +doxygen doc/Doxyfile +cd doc diff --git a/guide/mshadow-ps/README.md b/guide/mshadow-ps/README.md index 263935a28700..3a95798aae98 100644 --- a/guide/mshadow-ps/README.md +++ b/guide/mshadow-ps/README.md @@ -7,6 +7,7 @@ This allows you to do ***multi-GPU*** and ***disrtibuted*** (deep) learning in an ***easy*** and ***unified*** way. ####List of Resources +* [API Documentation](http://homes.cs.washington.edu/~tqchen/mshadow/doc/namespacemshadow_1_1ps.html) * [Library Interface Header](../../mshadow-ps/ps.h) * Tutorial in this page diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h index 4d9c2469a60a..6e6b08d2bd64 100644 --- a/mshadow-ps/ps.h +++ b/mshadow-ps/ps.h @@ -22,6 +22,7 @@ #endif namespace mshadow { +/*! \brief namespace of mshadow-ps */ namespace ps { /*! * \brief interface of parameter server @@ -129,7 +130,20 @@ class ISharedModel { devid, priority, callback, callback_arg); } #if __cplusplus >= 201103L - template + /*! + * \brief send a pull request, to pull parameter into data + * this call is asynchronize and returns immediately + * use PullWait to wait the event of copy finish + * this is the c++11 version that allows lambda function as callback + * \param data the data + * \param key the unique key to indicate the tensor, + * this is unique per device + * \param devid the device id this tensor lies in + * \param priority the priority of this operation, + * the bigger the number is the higher the priority will be + * \param callback the callback function + */ + template inline void PullReq(Tensor data, int key, int devid, @@ -249,8 +263,7 @@ class IModelUpdater { * \brief update the model, user can implement this one * to take advantage of tensor operations * \param key the key of data we point to - * \param dptr the data pointer - * \param size size of the parameter key + * \param data the tensor data corresponding to the data we want to initialize */ virtual void Update_(int key, Tensor data) { utils::Error("InitModel: not implemented"); @@ -273,6 +286,8 @@ namespace ps { /*! * \brief create a parameter server implementation * \param type the type of paramerver server + * can either be "local" or "dist" + * \return the ISharedModel that can be used to synchronize weights */ template inline ISharedModel *CreateSharedModel(const char *type) { diff --git a/mshadow/base.h b/mshadow/base.h index 6a10c1f18a0a..6336dfa023bc 100644 --- a/mshadow/base.h +++ b/mshadow/base.h @@ -283,6 +283,30 @@ struct divto { } // namespace sv /*! \brief namespace for potential reducer operations */ namespace red { +namespace limits { +/*! + * \brief minimum value of certain types + * \tparam DType data type + */ +template +MSHADOW_XINLINE DType MinValue(void); +/*! \brief minimum value of float */ +template<> +MSHADOW_XINLINE float MinValue(void) { + return -FLT_MAX; +} +/*! \brief minimum value of double */ +template<> +MSHADOW_XINLINE double MinValue(void) { + return -DBL_MAX; +} +/*! \brief minimum value of int */ +template<> +MSHADOW_XINLINE int MinValue(void) { + return INT_MIN; +} +} // namespace limits + /*! \brief sum reducer */ struct sum { /*! \brief do reduction into dst */ @@ -298,28 +322,14 @@ struct sum { MSHADOW_XINLINE static DType PartialGrad(DType redres, DType redsrc) { return 1; } + /*! + *\brief set the initial value during reduction + */ template MSHADOW_XINLINE static void SetInitValue(DType &initv) { initv = 0; } }; -/*! 
\brief helper namespace to get the limits */ -namespace limits { - template - MSHADOW_XINLINE DType MinValue(void); - template<> - MSHADOW_XINLINE float MinValue(void) { - return -FLT_MAX; - } - template<> - MSHADOW_XINLINE double MinValue(void) { - return -DBL_MAX; - } - template<> - MSHADOW_XINLINE int MinValue(void) { - return INT_MIN; - } -} // namespace limits /*! \brief maximum reducer */ struct maximum { /*! \brief do reduction into dst */ @@ -336,6 +346,9 @@ struct maximum { MSHADOW_XINLINE static DType PartialGrad(DType redres, DType redsrc) { return redres == redsrc ? 1: 0; } + /*! + *\brief set the initial value during reduction + */ template MSHADOW_XINLINE static void SetInitValue(DType &initv) { initv = limits::MinValue(); diff --git a/mshadow/expression.h b/mshadow/expression.h index c45568274aa2..d73c11f7d40c 100644 --- a/mshadow/expression.h +++ b/mshadow/expression.h @@ -96,6 +96,7 @@ template struct TypecastExp: public Exp, DstDType, etype> { + /*! \brief expression to be typecasted */ const EType &exp; /*! \brief constructor */ explicit TypecastExp(const EType &e) : exp(e) {} @@ -141,14 +142,17 @@ class RValueExp: public Exp { ExpEngine::Eval(this->ptrself(), scalar(s)); return *(this->ptrself()); } + /*! \brief operator overload */ inline Container &operator-=(DType s) { ExpEngine::Eval(this->ptrself(), scalar(s)); return *(this->ptrself()); } + /*! \brief operator overload */ inline Container &operator*=(DType s) { ExpEngine::Eval(this->ptrself(), scalar(s)); return *(this->ptrself()); } + /*! \brief operator overload */ inline Container &operator/=(DType s) { ExpEngine::Eval(this->ptrself(), scalar(s)); return *(this->ptrself()); @@ -164,7 +168,7 @@ class RValueExp: public Exp { ExpEngine::Eval(this->ptrself(), exp.self()); return *(this->ptrself()); } - // declar but not implement the assign to self type + /*! \brief operator overload, assign */ inline Container &__assign(const Exp &exp); /*! \brief implementation of operator+= */ template @@ -266,10 +270,11 @@ MakeExp(const Exp &lhs, const Exp &rhs) { return BinaryMapExp(lhs.self(), rhs.self()); } -/*! +/*! * \brief short hand for MakeExp, usage F(lhs, rhs). create a binary operation expression * \param lhs left operand * \param rhs right operand + * \return the result expression * \tparam binary operator * \tparam TA lhs expression * \tparam ta lhs expression type @@ -334,6 +339,7 @@ MakeExp(const Exp &src) { /*! * \brief short hand for MakeExp, usage F(src), create a unary operation expression * \param src source expression + * \return the result expression * \tparam operator * \tparam TA source expression * \tparam ta source expression type diff --git a/mshadow/io.h b/mshadow/io.h index 32a3dd50842c..5a298198123e 100644 --- a/mshadow/io.h +++ b/mshadow/io.h @@ -1,6 +1,6 @@ /*! 
* Copyright (c) 2014 by Contributors - * \file tensor_io.h + * \file io.h * \brief definitions of I/O functions for mshadow tensor * \author Tianqi Chen */ diff --git a/mshadow/random.h b/mshadow/random.h index d9e9d86acdcb..5213a69571f6 100644 --- a/mshadow/random.h +++ b/mshadow/random.h @@ -97,6 +97,7 @@ class Random { * since second call of gaussian(s2) makes gaussian(s1) invalid * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression * \param shape shape of the tensor + * \return a temporal expression storing standard gaussian random variables * \tparam dim dimension of tensor */ template @@ -114,6 +115,7 @@ class Random { * since second call of gaussian(s2) makes gaussian(s1) invalid * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression * \param shape shape of the tensor + * \return a temporal expression storing standard uniform [0,1) * \tparam dim dimension of tensor */ template @@ -293,6 +295,7 @@ class Random { * \param shape shape of the tensor * \param mu mean * \param sigma variance + * \return a temporal expression storing standard gaussian random variables * \tparam dim dimension of tensor */ template @@ -313,6 +316,7 @@ class Random { * since second call of gaussian(s2) makes gaussian(s1) invalid * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression * \param shape shape of the tensor + * \return a temporal expression storing standard uniform [0,1) * \tparam dim dimension of tensor */ template diff --git a/mshadow/tensor.h b/mshadow/tensor.h index 619d66758729..773094dd4637 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -69,7 +69,10 @@ struct Shape { MSHADOW_XINLINE const index_t &operator[](index_t idx) const { return shape_[idx]; } - /*! \return whether two shape equals */ + /*! + * \return whether two shape equals + * \param s the shape to compare against + */ MSHADOW_XINLINE bool operator==(const Shape &s) const { #pragma unroll for (int i = 0; i < kDimension; ++i) { @@ -308,11 +311,11 @@ struct Tensor: public TRValue, } /*! * \brief return size of i-th dimension, start counting from highest dimension - * \param the dimension count from the highest dimensin + * \param idx the dimension count from the highest dimensin * \return the size */ - MSHADOW_XINLINE index_t size(index_t i) const { - return shape_[i]; + MSHADOW_XINLINE index_t size(index_t idx) const { + return shape_[idx]; } /*! 
* \brief flatten the tensor to 2 dimension, collapse the higher dimensions together @@ -358,6 +361,7 @@ struct Tensor: public TRValue, operator=(const expr::Exp &exp) { return this->__assign(exp); } + /*!\brief functions to fit expression template */ inline Tensor &operator=(const DType &exp) { return this->__assign(exp); } diff --git a/mshadow/tensor_container.h b/mshadow/tensor_container.h index 5d8aeef30a9e..dbf250ceed28 100644 --- a/mshadow/tensor_container.h +++ b/mshadow/tensor_container.h @@ -109,20 +109,23 @@ class TensorContainer: public Tensor { Copy(*this, tmp, &stream); mshadow::FreeSpace(&tmp); } - // functions to fit exp template + /*!\brief functions to fit expression template */ inline Tensor &operator=(DType s) { return this->__assign(s); } + /*!\brief functions to fit expression template */ template inline Tensor & operator=(const expr::Exp &exp) { return this->__assign(exp); } + /*!\brief functions to fit expression template */ template inline Tensor & operator=(const expr::Exp &exp) { return this->__assign(exp); } + /*!\brief functions to fit expression template */ template inline Tensor & operator=(const expr::Exp &exp) {