diff --git a/CHANGES.md b/CHANGES.md new file mode 100644 index 000000000000..03bb16936acd --- /dev/null +++ b/CHANGES.md @@ -0,0 +1,12 @@ +Change Log +===== + +mshadow-1.0 +===== +* Initial release + +mshadow-2.0: in progress +===== +* Support multiple data type +* Great refactoring of code +* Parameter server interface for MultiGPU and distributed learning diff --git a/README.md b/README.md index 8cc278707b64..c19420f7e4a3 100644 --- a/README.md +++ b/README.md @@ -1,48 +1,38 @@ mshadow: Matrix Shadow ====== -Lightweight CPU/GPU Matrix/Tensor Template Library in C++/CUDA +MShadow is a lightweight CPU/GPU Matrix/Tensor Template Library in C++/CUDA. The goal of mshadow is to support ***efficient***, +***device invariant*** and ***simple*** tensor library for machine learning project that aims for both simplicity and performance. -Creater: Bing Xu and Tianqi Chen - - -Documentation and Tutorial: https://github.com/tqchen/mshadow/wiki - -Description -===== -Most machine learning algorithms requires matrix,tensor operations frequently. For example, Eq.(1) is a common SGD update rule, where the weight can be a vector, matrix or 3D tensor. Eq.(2) is the backpropagtion rule: -``` -(1) weight = - eta * ( grad + lambda * weight ); -(2) gradin = dot( gradout, netweight.T() ); -``` - -These operations are not hard to implement, even in C++. The first one is elementwise operations, and can easily be written as -``` -for( int index = 0; index < weight.length; index ++ ){ - weight[index] = - eta * ( grad[index] + lambda * weight[index] ); -} -``` -Eq.(2) is matrix product, and we can use standard BLAS packages such as Intel MKL. It will looklike -``` -sgemm( CblasNoTrans, CblasTrans, n, m, k, 1.0, gradout.ptr, lda, netweight.ptr, ldb, 0.0, gradin.ptr, ldc ); -``` -However: - -* It is annoying to write these codes repeatively, and they are not intuitive. -* What if we want to port our code to GPU? We need to rewrite our code in CUDA - -mshadow is a unified C++/CUDA lib to to write Eq.(1) and Eq.(2) in C++, and *translate* them to the for loop and standard packages such as MKL, CuBLAS *in compile time*. +MShadow also provides interface that allows writing Multi-GPU and distributed deep learning programs in an easy and unified way. +* [Contributors](https://github.com/tqchen/mshadow/graphs/contributors) +* [Tutorial](guide) +* [Documentation](doc) +* [Parameter Server Interface for GPU Tensor](guide/mshadow-ps) Features ===== -* Shadow instead of giant: mshadow does not implement all of the functions, it is more of a wrapper to translated easy-to-read code to standard 'giant' packages such as MKL -* Whitebox instead of blackbox: put a float* into the Tensor struct and take the benefit of the package, no memory allocation is happened unless explicitly called -* Unified CPU/GPU code: write a code and it should run in both CPU and GPU +* Efficient: all the expression you write will be lazily evaluated and compiled into optimized code + - No temporal memory allocation will happen for expression you write + - mshadow will generate specific kernel for every expression you write in compile time. +* Device invariant: you can write one code and it will run on both CPU and GPU +* Simple: mshadow allows you to write machine learning code using expressions. 
+* Whitebox: put a float* into the Tensor struct and take the benefit of the package, no memory allocation is happened unless explicitly called * Lightweight library: light amount of code to support frequently used functions in machine learning * Extendable: user can write simple functions that plugs into mshadow and run on GPU/CPU, no experience in CUDA is required. +* MultiGPU and Distributed ML: mshadow-ps interface allows user to write efficient MultiGPU and distributed programs in an unified way. +Version +====== +* This version mshadow-2.x, there are a lot of changes in the interface and it is not backward compatible with mshadow-1.0 + - If you use older version of cxxnet, you will need to use the legacy mshadow code +* For legacy code, refer to [Here](https://github.com/tqchen/mshadow/releases/tag/v1.1) +* Change log in [CHANGES.md](CHANGES.md) Related Projects ===== -* CXXNET: neural network implementation based on mshadow: https://github.com/antinucleon/cxxnet +* [CXXNET: large-scale deep learning backed by mshadow](https://github.com/antinucleon/cxxnet) +* [Parameter Server](https://github.com/mli/parameter_server) + - Parameter server project provides distributed back-end for mshadow-ps + - mshadow-ps extends original parameter server to support async updates for GPU Tensor diff --git a/doc/Doxyfile b/doc/Doxyfile index bef8089a3021..f3cc429213c9 100644 --- a/doc/Doxyfile +++ b/doc/Doxyfile @@ -8,7 +8,7 @@ PROJECT_NAME = "mshadow" PROJECT_NUMBER = PROJECT_BRIEF = PROJECT_LOGO = -OUTPUT_DIRECTORY = ../doc +OUTPUT_DIRECTORY = doc CREATE_SUBDIRS = NO OUTPUT_LANGUAGE = English BRIEF_MEMBER_DESC = YES @@ -95,13 +95,13 @@ WARN_LOGFILE = #--------------------------------------------------------------------------- # configuration options related to the input files #--------------------------------------------------------------------------- -INPUT = +INPUT = mshadow mshadow-ps INPUT_ENCODING = UTF-8 FILE_PATTERNS = RECURSIVE = NO EXCLUDE = EXCLUDE_SYMLINKS = NO -EXCLUDE_PATTERNS = *-inl.hpp +EXCLUDE_PATTERNS = *-inl.* utils.h thread_util.h thread.h kv_array.h EXCLUDE_SYMBOLS = mshadow::expr::Plan* mshadow::expr::*Engine* EXAMPLE_PATH = EXAMPLE_PATTERNS = diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 000000000000..9ea6172f37a7 --- /dev/null +++ b/doc/README.md @@ -0,0 +1,321 @@ +MShadow Documentation +===== +This is the documentation for mshadow: A Lightweight CPU/GPU Matrix/Tensor Template Library in C++/CUDA. + +### Links to Topics + +* [Tutorial](../guide) +* [API Documentation](http://homes.cs.washington.edu/~tqchen/mshadow/doc) + - You can run ```./mkdoc.sh``` to make the document locally +* [Tutorial about Expression Template](../guide/exp-template) +* [Writing Multi-GPU and Distributed ML](../guide/mshadow-ps) +* [Compile Configuration script](../make) +* [Expression API](#expression-api) + - Expression api introduces the concept of expression in mshadow + +Expression API +===== +Expression is the key concept in mshadow, a common operation of mshadow is ```tensor = some code to construct expression``` + +There are three major types of expression: +* Mapper expression: only contain element-wise operations of Mapper expressions + - Mapper expression can used as composition component of other operations. + - Tensor, scalar are Mapper expressions + - Example: ``` weight = - eta * (grad + lambda * weight)``` is a Mapper expression. + - Mapper expressions are translated using expression template code implemented by mshadow. 
+ - ***Assign safety***: Element-wise mapping are assign safe, which means, we can write ```A = A * 2 + B```, making lvalue appear in expression, the results are still correct. +* Chainer expression: may contain element-wise operation such as reduction and broadcast + - Example: ```dst = mirror(src)``` is a chainer expression + - ***Assign safety***: Most of the chainer extensions are not assignment safe, which means user should avoid putting target in source epression. +* Complex expression: complex operations, need special translation rule to translate to specific implementations. + - Complex expression can not be used as composition component of other operations. + - Example: ``` dot(lhs.T(), rhs)```, is complex expression, we can not write +``` dst = 1.0 + dot(lhs.T(), rhs)``` + - But limited syntax is supported depending on specification, for example, we do support ``` dst += 2.0f * dot(lhs.T(), rhs)``` + - Complex expressions are translated into specific implementations such as BLAS. + +### Element-wise Operations +The basic binary operators are overloaded to composite Mapper expressions, so we can write +```c++ +weight = (-eta) * (grad + lambda * weight); +``` +We can also use customized binary operators, and unary operators: +```c++ +struct maximum { + MSHADOW_XINLINE static float Map(float a, float b) { + return a > b ? a : b; + } +}; +template +void ExampleMaximum(Tensor out, + const Tensor &A, + const Tensor &B) { + out= 10.0f * F(A+1.0f, B); +} +struct sigmoid { + MSHADOW_XINLINE static float Map(float a) { + return 1.0f/(1.0f+expf(-a)); + } +}; +template +void ExampleSigmoid(Tensor out, const Tensor &in) { + // equivalent to out = sigmoid(in*2) + 1; + out = F(F(in * 2.0f), ScalarExp(1.0f)); +} +``` +### Matrix Multiplications +Matrix multiplications are supported by following syntax, with things brackets [] are optional +``` +dst [scale*] dot(lhs [.T()] , rhs [.T()]), can be =,+=,-= +``` +Example: +```c++ +template +void Backprop(Tensor gradin, + const Tensor &gradout, + const Tensor &netweight) { + gradin = 2.0 * dot(gradout, netweight.T()); +} +``` + +### Introducing Expression Extensions +Naming conventions: +* ```Tensor``` to refer to any Tensor with device any device and dimension. +* ```xpu```, ```dim```, are implicit template parameters. +* ```Expr``` will be used to refer to any mapper expression with type ```Tensor```. 
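+Before the list of extension functions, here is a minimal sketch of the assign-safety rules described above (the helper function name, the 2-D float shapes and the temporary tensor are illustrative assumptions, not part of the API):
+```c++
+template<typename xpu>
+void AssignSafetyExample(Tensor<xpu, 2, float> A,
+                         const Tensor<xpu, 2, float> &B,
+                         Tensor<xpu, 2, float> tmp) {
+  // Mapper expressions are assign safe: the target A may appear on the right.
+  A = A * 2.0f + B;
+  // Chainer extensions such as mirror are generally not assign safe,
+  // so stage the result in a separate tensor rather than writing A = mirror(A).
+  tmp = mirror(A);
+}
+```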
+ +List of functions: +* [reshape](#reshape): reshapes a tensor to another shape, number of content must be same +* [broadcast](#broadcast): replicate a 1 dimension tensor in certain dimension +* [repmat](#repmat), special case of broadcast<0>: repeat vector over rows to form a matrix +* [sumall_except_dim](#sumall_except_dim): sum over all the dimensions, except the dimension specified in template parameter +* [sum_rows](#sum_rows): special case of sumall_except_dim<0>, sum of rows in the matrix +* [unpack_patch2col](#unpack_patch2col): unpack local (overlap) patches of image to column of mat, can be used to implement convolution +* [pack_col2patch](#pack_col2patch): reverse operation of unpack_patch2col, can be used to implement deconvolution +* [pool](#pool): do pooling on image +* [unpool](#unpool): get gradient of pooling result +* [crop](#crop): crop the original image to a smaller size +* [mirror](#mirror): get the mirrored result of input expression + +====== +##### reshape +* ```reshape(Expr src, Shape oshape)``` +* reshapes a tensor to another shape, total number of elements must be same +* parameters: + - src: input data + - oshape: target shape +* result expression type: ```Tensor``` with ```shape=oshape```, is Mapper expression +```c++ +void ExampleReshape(void) { + Tensor dst = NewTensor(Shape2(4, 5)); + Tensor src = NewTensor(Shape1(20), 1.0f); + dst = reshape(src, dst.shape_); + ... +} +``` +====== + +##### broadcast +* ```broadcast(Tensor src, Shape oshape)``` +* replicate a 1 dimension tensor certain dimension, specified by template parameter dimcast +* parameters: + - src: input 1 dimensional tensor + - oshape: shape of output +* return expression type: ```Tensor```, ```shape = oshape```, is Chainer expression +```c++ +void ExampleBroadcast(void) { + Tensor dst = NewTensor(Shape2(2, 3)); + Tensor src = NewTensor(Shape1(2), 1.0f); + src[0] = 2.0f; src[1] = 1.0f; + dst = broadcast<0>(src, dst.shape_); + // dst[0][0] = 2, dst[0][1] = 2; dst[1][0]=1, dst[1][1] = 1 + ... +} +``` +====== +##### repmat +* ```repmat(Tensor src, int nrows) ```` +* special case of broadcast, repeat 1d tensor over rows +* input parameters: + - src: input vector + - nrows: number of rows in target +* return expression type: ```Tensor```, with ```shape=(nrows, src.size(0))```, is Chainer expression +```c++ +void ExampleRepmat(void) { + Tensor dst = NewTensor(Shape2(3, 2)); + Tensor src = NewTensor(Shape1(2), 1.0f); + src[0] = 2.0f; src[1] = 1.0f; + dst = repmat(src, 3); + // dst[0][0] = 2, dst[0][1] = 1; dst[1][0]=2, dst[1][1] = 1 + ... +} +``` +====== +##### sumall_except_dim +* ```sumall_except_dim(Expr src) ```` +* sum over all dimensions, except dimkeep +* input parameters: + - src: input mapper expression +* return expression type: ```Tensor```, with ```shape=(src.size(dimkeep))```, is Complex expression +* Syntax: ```dst [sv] [scale*] sumall_except_dim(src) , can be =, +=, -=, *=, /=```` +```c++ +void ExampleSumAllExceptDim(void) { + Tensor src = NewTensor(Shape3(2, 3, 2), 1.0f); + Tensor dst = NewTensor(Shape1(3), 1.0f); + dst += sum_all_except<1>(src * 2.0f); + // dst[0] = 1.0 + 4.0 *2.0 = 9.0 + ... 
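+  // note: dimension 1 (size 3) is kept, so each dst element accumulates the
+  // remaining 2 x 2 = 4 entries of (src * 2.0f): dst[i] = 1.0 + 4 * 2.0 = 9.0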
+} +``` +====== +##### sum_rows +* ```sum_rows(Expr src) ```` +* sum of rows in the matrix +* input parameters: + - src: input mapper expression +* return expression type: ```Tensor```, with ```shape=(src.size(0))```, is Complex expression +* Syntax: ```dst [sv] [scale*] sum_rows(src) , can be =,+=,-=,*=,/=```` +```c++ +void ExampleSumRows(void) { + Tensor src = NewTensor(Shape2(3, 2), 1.0f); + Tensor dst = NewTensor(Shape1(2), 1.0f); + dst += sum_rows(src + 1.0f); + // dst[0] = 1.0 + 3.0 *(1.0+1.0) = 7.0 + ... +} +``` +====== +##### unpack_patch2col +* ```unpack_patch2col(Expr img, int psize_y, int p_size_x, int pstride) ```` +* unpack local (overlap) patches of image to column of mat, can be used to implement convolution, after getting unpacked mat, we can use: ```output = dot(weight, mat)``` to get covolved results, the relations: + - weight; shape[0]: out_channel, shape[1]: ichannel * psize_y * psize_x + - output; shape[0]: out_channel, shape[1]: out_height * out_width * num_of_images + - out_height = (in_height - psize_y) / pstride + 1, this means we pad inperfect patch with 0 + - out_width = (in_width - psize_x) / pstride + 1 +* input parameters: + - img: source image, can be expression; (in_channels, in_height, in_width) + - psize_y height of each patch + - psize_x width of each patch + - pstride: stride of each patch +* return expression type: ```Tensor```, with ```shape=(in_channel*psize*psize, out_height*out_width)```, is Chainer expression +```c++ +void ExampleCovolution(Tensor dst, Tensor src, + Tensor weight, int ksize, int stride) { + int o_height = (src.size(1)- ksize) / stride + 1; + int o_width = (src.size(2)- ksize) / stride + 1; + utils::Assert(weight.size(0) == src.size(0) * ksize * ksize); + TensorContainer tmp_col(Shape2(src.size(0) * ksize * ksize, + o_height * o_width)); + TensorContainer tmp_dst(Shape2(weight.size(0), + o_height * o_width)); + tmp_col = unpack_patch2col(src, ksize, ksize, stride); + tmp_dst = dot(weight, tmp_col); + dst = reshape(tmp_dst, dst.shape_); +} +``` + +====== +##### pack_col2patch +* ```pack_col2patch(Tensor mat, Shape<3> imshape, int psize_y, int psize_x, int pstride) ```` +* reverse operation of unpack_patch2col, can be used to implement deconvolution +* input parameters: + - mat: source mat, same shape as output of unpack_patch2col + - imshape: shape of target image + - psize_y height of each patch + - psize_x width of each patch + - pstride: stride of each patch +* return expression type: ```Tensor```, with ```shape = imshape```, is Chainer expression +```c++ +void ExampleDecovolution(Tensor bottom, Tensor top, + Tensor weight, int ksize, int stride) { + int o_height = (bottom.size(1)- ksize) / stride + 1; + int o_width = (bottom.size(2)- ksize) / stride + 1; + utils::Assert(weight.size(0) == bottom.size(0) * ksize * ksize); + TensorContainer tmp_col(Shape2(bottom.size(0) * ksize * ksize, + o_height * o_width)); + TensorContainer tmp_dst(Shape2(weight.size(0), o_height*o_width)); + tmp_dst = reshape(top, tmp_dst.shape_); + tmp_col = dot(weight.T(), tmp_dst); + bottom = pack_col2patch(tmp_col, bottom.shape_, ksize, ksize, stride); +} +``` + +====== +##### pool +* ```pool(Expr img, [Shape<2> pshape,] int ksize_y, int ksize_x, int kstride)``` +* Pooling on image with specify kernel size and stride, can be used to implement max pooilng and other pooling layer +* input parameters: + - Reducer: operation can be max or sum + - img: source image, can be expression; (in_channels, in_height, in_width) + - [optional] Shape<2> pshape, output 
shape + - ksize_y height of each patch + - ksize_x width of each patch + - kstride: stride of each patch +* return expression: ```Expr```, with ```shape = (in_channel, (out_height - ksize) / kstride + 1, (out_width - ksize) / kstride + 1)```, or expression in pshape + - Chainer expression +```c++ +void ExampleMaxPooling(TensorContainer &data, int ksize, int stride) { + TensorContainer pooled(Shape3(data.size(0), + (data.size(2) - ksize) / kstride + 1), + (data.size(1) - ksize) / kstride + 1)); + pooled = pool(data, ksize, ksize, stride); +} +``` + +====== +##### unpool +* ```unpool(Tensor data_src, Tensor data_pooled, Tensor grad_pooled, int ksize_y, int ksize_x, int kstride)``` +* Unpooling on image with specify kernel size and stride, can be used to implement backprop of max pooilng and other pooling layer +* input parameters: + - Reducer: operation can be max or sum + - data_src: source image batch. + - data_pooled: pooled image batch. + - grad_pooled: gradient of upper layer + - ksize_y height of each patch + - ksize_x width of each patch + - kstride: stride of each patch +* return: + Expression, same shape to data_src +```c++ +void ExampleMaxUnpooling(Tensor &data_src, Tensor &data_pooled, + Tensor &grad_pooled, int ksize, int kstride) { + TensorContainer grad(data_src.shape_); + grad = unpool(data_src, data_pooled, + grad_pooled, ksize, ksize, kstride); +} +``` + +====== +##### crop +* ```crop(Expr src, Shape<2> oshape, int start_height, int start_width)``` +* input parameters: + - src: input expression + - oshape: output shape after crop + - start_height: start height for cropping + - start_width: start width for cropping +* Can also be ```crop(Expr src, Shape<2> oshape)``` where the crop will happen in center. +* return + - cropped expression +```c++ +void ExampleCrop(TensorContainer img, int start_height, int start_width) { + TensorContainer cropped(Shape3(img.size(0), + img.size(1) - start_height, + img.size(2) - start_width)); + cropped = crop(img, start_height, start_width); +} +``` + +====== +##### mirror +* ```mirrow(Expr src)``` +* input: + - src, source expression to be mirrored +* output: + - expression of mirrored result +```c++ +void ExampleMirror(TensorContainer img) { + TensorContainer mirrored(img.shape_); + mirrored = mirror(img); +} +``` + diff --git a/doc/mkdoc.sh b/doc/mkdoc.sh index 2c4b038106c1..3ee3d71b8ce8 100755 --- a/doc/mkdoc.sh +++ b/doc/mkdoc.sh @@ -1,4 +1,4 @@ #!/bin/bash -cd ../mshadow -doxygen ../doc/Doxyfile -cd ../doc +cd .. 
+doxygen doc/Doxyfile +cd doc diff --git a/example/Makefile.openblas b/example/Makefile.openblas deleted file mode 100644 index bd90eca3922a..000000000000 --- a/example/Makefile.openblas +++ /dev/null @@ -1,37 +0,0 @@ -# set LD_LIBRARY_PATH -# echo "Link mshadow with precomplied Openblas" -export OPENBLAS_ROOT=../../OpenBLAS-v0.2.13-Win64-int32 -export CC = gcc -export CXX = g++ -export NVCC =nvcc -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -I$(OPENBLAS_ROOT)/include -DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CBLAS=1 -D__APPLE__ -export LDFLAGS= -static -lpthread -lopenblas -L$(OPENBLAS_ROOT)/lib -export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) - -# specify tensor path -BIN = basic defop basic-matrix-dot -OBJ = -CUOBJ = -CUBIN = -.PHONY: clean all - -all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ) - -basic: basic.cpp -defop: defop.cpp -basic-matrix-dot: basic-matrix-dot.cpp - -$(BIN) : - $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) - -$(OBJ) : - $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) - -$(CUOBJ) : - $(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $(filter %.cu, $^) - -$(CUBIN) : - $(NVCC) -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -Xlinker "$(LDFLAGS)" $(filter %.cu %.cpp %.o, $^) - -clean: - $(RM) $(OBJ) $(BIN) $(CUBIN) $(CUOBJ) *~ diff --git a/example/basic-matrix-dot.cpp b/example/basic-matrix-dot.cpp deleted file mode 100644 index 5c5485beb238..000000000000 --- a/example/basic-matrix-dot.cpp +++ /dev/null @@ -1,20 +0,0 @@ -// header file to use mshadow -#include "mshadow/tensor.h" -// this namespace contains all data structures, functions -using namespace mshadow; -// this namespace contains all operator overloads -using namespace mshadow::expr; - -int main( void ){ - // intialize tensor engine before using tensor operation, needed for CuBLAS - InitTensorEngine(); - - Tensor mat = NewTensor( Shape2(1000,1000), 1.0 ); - for (int i=0;i<100;i++) - mat = dot(mat, mat); - FreeSpace(mat); - // shutdown tensor enigne after usage - - ShutdownTensorEngine(); - return 0; -} diff --git a/example/basic.cpp b/example/basic.cpp deleted file mode 100644 index 53f85ae1a262..000000000000 --- a/example/basic.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// header file to use mshadow -#include "mshadow/tensor.h" -// this namespace contains all data structures, functions -using namespace mshadow; -// this namespace contains all operator overloads -using namespace mshadow::expr; - -int main( void ){ - // intialize tensor engine before using tensor operation, needed for CuBLAS - InitTensorEngine(); - // assume we have a float space - real_t data[ 20 ]; - // create a 2 x 5 x 2 tensor, from existing space - Tensor ts( data, Shape3(2,5,2) ); - // take first subscript of the tensor - Tensor mat = ts[0]; - // Tensor object is only a handle, assignment means they have same data content - Tensor mat2 = mat; - - // shape of matrix, note shape order is different from numpy - // shape[i] indicate the shape of i-th dimension - printf("%u X %u matrix\n", mat.shape[1], mat.shape[0] ); - - // initialize all element to zero - mat = 0.0f; - // assign some values - mat[0][1] = 1.0f; mat[1][0] = 2.0f; - // elementwise operations - mat += ( mat + 10.0f ) / 10.0f + 2.0f; - - // print out matrix, note: mat2 and mat1 are handles(pointers) - for( index_t i = 0; i < mat.shape[1]; i ++ ){ - for( index_t j = 0; j < mat.shape[0]; j ++ ){ - printf("%.2f ", mat2[i][j]); - } - printf("\n"); - } - // shutdown tensor enigne after usage - ShutdownTensorEngine(); - 
return 0; -} diff --git a/example/defop.cpp b/example/defop.cpp deleted file mode 100644 index 990f4728bed7..000000000000 --- a/example/defop.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include -// header file to use mshadow -#include "mshadow/tensor.h" -// this namespace contains all data structures, functions -using namespace mshadow; -// this namespace contains all operator overloads -using namespace mshadow::expr; - -// user defined unary operator addone -struct addone{ - MSHADOW_XINLINE static real_t Map(real_t a) { - return a + 1.0f; - } -}; -// user defined binary operator max of two -struct maxoftwo{ - MSHADOW_XINLINE static real_t Map(real_t a,real_t b) { - if( a > b ) return a; - else return b; - } -}; - -int main( void ){ - // intialize tensor engine before using tensor operation, needed for CuBLAS - InitTensorEngine(); - // take first subscript of the tensor - Tensor mat = NewTensor( Shape2(2,3), 0.0f ); - Tensor mat2= NewTensor( Shape2(2,3), 0.0f ); - - mat[0][0] = -2.0f; - mat = F( F( mat ) + 1.0f, mat2 ); - - for( index_t i = 0; i < mat.shape[1]; i ++ ){ - for( index_t j = 0; j < mat.shape[0]; j ++ ){ - printf("%.2f ", mat[i][j]); - } - printf("\n"); - } - - FreeSpace( mat ); FreeSpace( mat2 ); - // shutdown tensor enigne after usage - ShutdownTensorEngine(); - return 0; -} diff --git a/example/exp-template/README.md b/example/exp-template/README.md deleted file mode 100644 index 8c30a2998c2a..000000000000 --- a/example/exp-template/README.md +++ /dev/null @@ -1,4 +0,0 @@ -This folder is not example of mshadow code. -It is example code introducing expression template, the trick behind mshadow. - -See: https://github.com/tqchen/mshadow/wiki/Expression-Template diff --git a/example/exp-template/exp_lazy.cpp b/example/exp-template/exp_lazy.cpp deleted file mode 100644 index 91f49b4fca78..000000000000 --- a/example/exp-template/exp_lazy.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// Example Lazy evaluation code -// for simplicity, we use struct and make all members public -#include -struct Vec; -// expression structure holds the expression -struct BinaryAddExp{ - const Vec& lhs; - const Vec& rhs; - BinaryAddExp(const Vec& lhs, const Vec& rhs):lhs(lhs),rhs(rhs){} -}; -// no constructor and destructor to allocate and de-allocate memory, allocation done by user -struct Vec { - int len; - float* dptr; - Vec (void){} - Vec (float *dptr, int len):len(len),dptr(dptr){} - // here is where evaluation happens - inline Vec& operator= (const BinaryAddExp& src){ - for( int i = 0; i < len; ++i ){ - dptr[i] = src.lhs.dptr[i] + src.rhs.dptr[i]; - } - return *this; - } -}; -// no evaluation happens here -inline BinaryAddExp operator+ (const Vec& lhs, const Vec& rhs){ - return BinaryAddExp(lhs, rhs); -} - -const int n = 3; -int main( void ){ - float sa[n]={1,2,3},sb[n]={2,3,4},sc[n]={3,4,5}; - Vec A(sa,n), B(sb,n), C(sc,n); - // run expression - A = B + C; - for( int i = 0; i < n; ++ i ){ - printf("%d:%f==%f+%f\n", i, A.dptr[i], B.dptr[i], C.dptr[i] ); - } - return 0; -} diff --git a/example/exp-template/exp_template.cpp b/example/exp-template/exp_template.cpp deleted file mode 100644 index d9ec4622f706..000000000000 --- a/example/exp-template/exp_template.cpp +++ /dev/null @@ -1,64 +0,0 @@ -// Example code, expression template, and more length equations -// for simplicity, we use struct and make all members public - -#include - -// this is expression, all expressions must inheritate it, and put their type in subtype -template -struct Exp{ - // returns const reference of the actual type of this expression - 
inline const SubType& self(void) const{ - return *static_cast(this); - } -}; - -// binary add expression -// note how it is inheritates from Exp -// and put its own type into the template argument -template -struct BinaryAddExp: public Exp< BinaryAddExp >{ - const TLhs& lhs; - const TRhs& rhs; - BinaryAddExp(const TLhs& lhs, const TRhs& rhs):lhs(lhs),rhs(rhs){} - // evaluation function, evaluate this expression at position i - inline float Eval( int i ) const{ - return lhs.Eval(i) + rhs.Eval(i); - } -}; -// no constructor and destructor to allocate and de-allocate memory, allocation done by user -struct Vec: public Exp{ - int len; - float* dptr; - Vec (void){} - Vec (float *dptr, int len):len(len),dptr(dptr){} - // here is where evaluation happens - template - inline Vec& operator= (const Exp& src_){ - const EType &src = src_.self(); - for( int i=0; i < len; ++i ){ - dptr[i] = src.Eval(i); - } - return *this; - } - // evaluation function, evaluate this expression at position i - inline float Eval( int i ) const{ - return dptr[i]; - } -}; -// template add, works for any expressions -template -inline BinaryAddExp operator+ (const Exp& lhs, const Exp& rhs){ - return BinaryAddExp(lhs.self(), rhs.self()); -} - -const int n = 3; -int main( void ){ - float sa[n]={1,2,3},sb[n]={2,3,4},sc[n]={3,4,5}; - Vec A(sa,n), B(sb,n), C(sc,n); - // run expression, this expression is longer:) - A = B + C + C; - for( int i = 0; i < n; ++ i ){ - printf("%d:%f==%f+%f+%f\n", i, A.dptr[i], B.dptr[i], C.dptr[i], C.dptr[i] ); - } - return 0; -} diff --git a/example/exp-template/exp_template_op.cpp b/example/exp-template/exp_template_op.cpp deleted file mode 100644 index 4399936b6981..000000000000 --- a/example/exp-template/exp_template_op.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// Example code, expression template -// with binary operator definition and extension -// for simplicity, we use struct and make all members public - -#include - -// this is expression, all expressions must inheritate it, and put their type in subtype -template -struct Exp{ - // returns const reference of the actual type of this expression - inline const SubType& self(void) const{ - return *static_cast(this); - } -}; - -// binary operators -struct mul{ - inline static float Map(float a, float b){ - return a * b; - } -}; - -// binary add expression -// note how it is inheritates from Exp -// and put its own type into the template argument -template -struct BinaryMapExp: public Exp< BinaryMapExp >{ - const TLhs& lhs; - const TRhs& rhs; - BinaryMapExp(const TLhs& lhs, const TRhs& rhs):lhs(lhs),rhs(rhs){} - // evaluation function, evaluate this expression at position i - inline float Eval( int i ) const{ - return OP::Map( lhs.Eval(i), rhs.Eval(i) ); - } -}; -// no constructor and destructor to allocate and de-allocate memory, allocation done by user -struct Vec: public Exp{ - int len; - float* dptr; - Vec (void){} - Vec (float *dptr, int len):len(len),dptr(dptr){} - // here is where evaluation happens - template - inline Vec& operator= (const Exp& src_){ - const EType &src = src_.self(); - for( int i=0; i < len; ++i ){ - dptr[i] = src.Eval(i); - } - return *this; - } - // evaluation function, evaluate this expression at position i - inline float Eval( int i ) const{ - return dptr[i]; - } -}; -// template add, works for any expressions -template -inline BinaryMapExp F(const Exp& lhs, const Exp& rhs){ - return BinaryMapExp(lhs.self(), rhs.self()); -} - -template -inline BinaryMapExp operator* (const Exp& lhs, const Exp& rhs){ - return F(lhs, rhs); -} - 
-// user defined operation -struct maximum{ - inline static float Map(float a, float b){ - return a > b ? a : b; - } -}; - -const int n = 3; -int main( void ){ - float sa[n]={1,2,3},sb[n]={2,3,4},sc[n]={3,4,5}; - Vec A(sa,n), B(sb,n), C(sc,n); - // run expression, this expression is longer:) - A = B * F(C, B); - for( int i = 0; i < n; ++ i ){ - printf("%d:%f==%f*max(%f,%f)\n", i, A.dptr[i], B.dptr[i], C.dptr[i], B.dptr[i] ); - } - return 0; -} diff --git a/example/neuralnet/README.md b/example/neuralnet/README.md deleted file mode 100644 index fb5b59a3fb07..000000000000 --- a/example/neuralnet/README.md +++ /dev/null @@ -1,10 +0,0 @@ -This folder contains a mshadow example of simple neural net implementation - -To compile the code, type make: -* You will need to have CUDA and MKL installed. -* Alternatively, you can compile with CBLAS packages to replace MKL such as BLAS or ATLAS, type make blas=1 - -To run the demo, download MNIST dataset from: http://yann.lecun.com/exdb/mnist/ -unzip all the files into current folder - -and run by ./nnet cpu or ./nnet gpu. ./convnet cpu or ./convnet gpu diff --git a/example/neuralnet/build_openblash.sh b/example/neuralnet/build_openblash.sh deleted file mode 100644 index dd33f2cbc07c..000000000000 --- a/example/neuralnet/build_openblash.sh +++ /dev/null @@ -1,3 +0,0 @@ -mv nnet.cu nnet.cpp -mv convnet.cu convnet.cpp -make -f Makefile.openblas \ No newline at end of file diff --git a/example/neuralnet/convnet.cu b/example/neuralnet/convnet.cu deleted file mode 100644 index de8f65b5568b..000000000000 --- a/example/neuralnet/convnet.cu +++ /dev/null @@ -1,259 +0,0 @@ -// this implements a simple convolution neural net: conv-maxpool-fullc -#include -// header file to use mshadow -#include "mshadow/tensor.h" -// helper function to load mnist dataset -#include "util.h" -// this namespace contains all data structures, functions -using namespace mshadow; -// this namespace contains all operator overloads -using namespace mshadow::expr; - -// define operations -struct relu{ - MSHADOW_XINLINE static real_t Map(real_t a) { - using namespace std; - return max( a, 0.0f ); - } -}; -struct relu_grad { - MSHADOW_XINLINE static real_t Map(real_t a) { - return a > 0.0f ? 1.0f : 0.0f; - } -}; - -/*! \brief interface for nnet, interfacd allows use to use GPU/CPU implementation in a unified way */ -class INNet{ -public: - virtual void Forward( const Tensor& inbatch, Tensor &oubatch ) = 0; - virtual void Backprop( const Tensor& gradout ) = 0; - virtual void Update( void ) = 0; - virtual ~INNet(){} -}; - -/*! 
- * \brief simple two layer conv-net conv-pool-flat-fullc - * this implementation is device invariant - */ -template -class ConvNet : public INNet{ -public: - // initialize the network - ConvNet( int batch_size, int insize, int nchannel, int ksize, int kstride, int psize, int num_out ) - :rnd(0), ksize(ksize), kstride(kstride), psize(psize){ - // setup nodes - ninput.Resize( Shape4( batch_size, 1, insize, insize ) ); - nhidden.Resize( Shape4( batch_size, nchannel, (insize - ksize)/kstride+1, (insize -ksize)/kstride+1) ); - nhiddenbak.Resize( nhidden.shape ); - npool.Resize( Shape4( batch_size, nchannel, (nhidden.shape[1]+1-psize)/psize, (nhidden.shape[0]+1-psize)/psize ) ); - npoolbak.Resize( npool.shape ); - nflat.Resize( Shape2( batch_size, npool.shape[2]*npool.shape[1]*npool.shape[0] ) ); - nout.Resize( Shape2( batch_size, num_out ) ); - // setup bias - hbias.Resize( Shape1( nchannel ) ); g_hbias.Resize( hbias.shape ); - obias.Resize( Shape1( num_out ) ); g_obias.Resize( obias.shape ); - hbias = 0.0f; obias = 0.0f; - // setup weights - Ki2h.Resize( Shape2( nchannel, ksize*ksize ) ); g_Ki2h.Resize( Ki2h.shape ); - Wh2o.Resize( Shape2( nflat.shape[0], num_out ) ); g_Wh2o.Resize( Wh2o.shape ); - rnd.SampleGaussian( Ki2h, 0, 0.01f ); - rnd.SampleGaussian( Wh2o, 0, 0.01f ); - - printf("conv=%d, pool=%d\n", nhidden.shape[0], npool.shape[0] ); - } - virtual ~ConvNet(){} - // forward propagation - virtual void Forward( const Tensor& inbatch, Tensor &oubatch ){ - index_t batch_size = inbatch.shape[3]; - // copy data to input layer - Copy( ninput, inbatch ); - // first layer, conv, use stride=2 - ConvForward( ninput, Ki2h, nhidden, ksize, kstride, tmp_col, tmp_dst ); - // add bias - nhidden += broadcast<2>( hbias, nhidden.shape ); - // activation, relu, backup activation in nhidden - nhidden = F( nhidden ); - Copy( nhiddenbak, nhidden ); - // max pooling - npool = pool( nhiddenbak, npool[0][0].shape, psize, psize ); - Copy( npoolbak, npool ); - // flat - nflat = reshape( npool, nflat.shape ); - // second layer fullc - nout = dot( nflat, Wh2o ); - nout += repmat( obias, batch_size ); - // softmax calculation - Softmax( nout, nout ); - // copy result out - Copy( oubatch, nout ); - } - // back propagation - virtual void Backprop( const Tensor& gradout ){ - // copy gradient to output layer - Copy( nout, gradout ); - // calc grad of final layer - g_obias = sum_rows( nout ); - g_Wh2o = dot( nflat.T(), nout ); - // backprop to previous layer - nflat = dot( nout, Wh2o.T() ); - npool = reshape( nflat, npool.shape ); - // backprop pooling layer - nhiddenbak = unpool( nhiddenbak, npoolbak, npool, psize, psize ); - // calculate gradient of relu layer - nhidden = F( nhidden ) * nhiddenbak; - // calc grad of layer 1 - g_hbias = sumall_except_dim<2>( nhidden ); - ConvBackWard( nhidden, Ki2h, g_Ki2h, ninput, ksize, kstride, tmp_col, tmp_dst ); - } - // update weight - virtual void Update( void ){ - // run SGD - const float eta = 0.1; - const float wd = 0.00001; - // update weight - Ki2h -= eta * ( wd * Ki2h + g_Ki2h ); - Wh2o -= eta * ( wd * Wh2o + g_Wh2o ); - // no regularization for bias - hbias-= eta * g_hbias; - obias-= eta * g_obias; - } -private: - // forward convolution, tmp_col and tmp_dst are helper structure - inline static void ConvForward( const Tensor &in, const Tensor &kernel, Tensor &out, - int ksize, int kstride, - TensorContainer &tmp_col, TensorContainer& tmp_dst ){ - index_t oheight = (in.shape[1] - ksize)/kstride + 1; - index_t owidth = (in.shape[0] - ksize)/kstride + 1; - index_t nbatch = 
in.shape[3]; - index_t nchannel = out.shape[2]; - // we directly unpack all local patches and do a dot product - // this cost lots of memory, normally for large image, only unpack several image at a time - tmp_col.Resize( Shape2( in.shape[2]*ksize*ksize, nbatch*oheight*owidth ) ); - tmp_dst.Resize( Shape2( nchannel, nbatch*oheight*owidth ) ); - // unpack local patches , stride=1 - tmp_col = unpack_patch2col( in, ksize, kstride ); - tmp_dst = dot( kernel, tmp_col ); - // reshape, then swap axis, we chain equations together - out = swapaxis<2,3>( reshape( tmp_dst, Shape4( nchannel, nbatch, oheight, owidth ) ) ); - } - - // backward convolution, calculate gradient of kernel, and backprop back to in - inline static void ConvBackWard( const Tensor &out, const Tensor &kernel, - Tensor &g_kernel, Tensor &in, - int ksize, int kstride, - TensorContainer &tmp_col, TensorContainer& tmp_dst ){ - index_t oheight = (in.shape[1] - ksize)/kstride + 1; - index_t owidth = (in.shape[0] - ksize)/kstride + 1; - index_t nbatch = in.shape[3]; - index_t nchannel = out.shape[2]; - // we directly unpack all local patches and do a dot product - // this cost lots of memory, normally for large image, only unpack several image at a time - tmp_col.Resize( Shape2( in.shape[2]*ksize*ksize, nbatch*oheight*owidth ) ); - tmp_dst.Resize( Shape2( nchannel, nbatch*oheight*owidth ) ); - // unpack local patches - tmp_col = unpack_patch2col( in, ksize, kstride ); - tmp_dst = reshape( swapaxis<2,3>( out ), tmp_dst.shape ); - g_kernel = dot( tmp_dst, tmp_col.T() ); - // backpropgation: not necessary for first layer, but included anyway - tmp_col = dot( kernel.T(), tmp_dst ); - in = pack_col2patch( tmp_col, in.shape, ksize, kstride ); - } -private: - // random seed generator - Random rnd; - // kernel size, pooling size - int ksize, kstride, psize; - // nodes in neural net - TensorContainer ninput, nhidden, nhiddenbak, npool, npoolbak; - TensorContainer nflat, nout; - // temp helper structure - TensorContainer tmp_col, tmp_dst; - // hidden bias, gradient - TensorContainer hbias, obias, g_hbias, g_obias; - // weight, gradient: Ki2h is actually convoltuion kernel, with shape=(num_channel,ksize*ksize) - TensorContainer Ki2h, Wh2o, g_Ki2h, g_Wh2o; -}; - -// helper function to get the max inde -inline int MaxIndex( Tensor pred ){ - int maxidx = 0; - for( index_t i = 1; i < pred.shape[0]; ++i ){ - if( pred[i] > pred[maxidx] ) maxidx = (int)i; - } - return maxidx; -} - -int main( int argc, char *argv[] ){ - if( argc < 2 ){ - printf("Usage: cpu or gpu\n"); return 0; - } - srand(0); - InitTensorEngine(); - - // settings - int batch_size = 100; - int insize = 28; - int nchannel = 10; - int ksize = 5; - int kstride = 1; - int psize = 2; - int num_out = 10; - - // choose which version to use - INNet *net; - if( !strcmp( argv[1], "gpu") ) { -#if DMSHADOW_USE_CUDA==1 - net = new ConvNet( batch_size, insize, nchannel, ksize, kstride, psize, num_out ); -#endif - }else{ - net = new ConvNet( batch_size, insize, nchannel, ksize, kstride, psize, num_out ); - } - - // temp output layer - TensorContainer pred; - pred.Resize( Shape2( batch_size, num_out ) ); - - // label - std::vector ytrain, ytest; - // data - TensorContainer xtrain_, xtest_; - LoadMNIST( "train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain_, true); - LoadMNIST( "t10k-images-idx3-ubyte", "t10k-labels-idx1-ubyte", ytest, xtest_, false); - - TensorContainer xtrain( Shape4(xtrain_.shape[1], 1, insize, insize) ); - TensorContainer xtest( Shape4(xtest_.shape[1], 1, insize, 
insize) ); - xtrain = reshape( xtrain_, xtrain.shape ); - xtest = reshape( xtest_, xtest.shape ); - - int num_iter = 20; - - for( int i = 0; i < num_iter; ++ i ){ - // training - for( index_t j = 0; j + batch_size <= xtrain.shape[3]; j += batch_size ){ - net->Forward( xtrain.Slice( j, j + batch_size ), pred ); - // set gradient into pred - for( int k = 0; k < batch_size; ++ k ){ - pred[k][ ytrain[k+j] ] -= 1.0f; - } - // scale gradient by batchs zie - pred *= 1.0f / batch_size; - // run backprop - net->Backprop( pred ); - // update net parameters - net->Update(); - } - // evaluation - long nerr = 0; - for( index_t j = 0; j + batch_size <= xtest.shape[3]; j += batch_size ){ - net->Forward( xtest.Slice( j, j + batch_size ), pred ); - for( int k = 0; k < batch_size; ++ k ){ - nerr += MaxIndex( pred[k] ) != ytest[j+k]; - - } - } - printf("round %d: test-err=%f\n", i, (float)nerr/xtest.shape[3] ); - } - delete net; - ShutdownTensorEngine(); - return 0; -} diff --git a/example/neuralnet/nnet.cu b/example/neuralnet/nnet.cu deleted file mode 100644 index a1b4dc2f67f5..000000000000 --- a/example/neuralnet/nnet.cu +++ /dev/null @@ -1,187 +0,0 @@ -// this implements a simple two layer neural net -#include -// header file to use mshadow -#include "mshadow/tensor.h" -// helper function to load mnist dataset -#include "util.h" -// this namespace contains all data structures, functions -using namespace mshadow; -// this namespace contains all operator overloads -using namespace mshadow::expr; - -// define sigmoid operation -struct sigmoid{ - MSHADOW_XINLINE static real_t Map(real_t a) { - return 1.0f/(1.0f+expf(-a)); - } -}; - -/*! \brief interface for nnet, interfacd allows use to use GPU/CPU implementation in a unified way */ -class INNet{ -public: - virtual void Forward( const Tensor& inbatch, Tensor &oubatch ) = 0; - virtual void Backprop( const Tensor& gradout ) = 0; - virtual void Update( void ) = 0; - virtual ~INNet(){} -}; - -/*! 
- * \brief simple two layer neural net - * this implementation is device invariant - */ -template -class NNet : public INNet{ -public: - // initialize the network - NNet( int batch_size, int num_in, int num_hidden, int num_out ):rnd(0){ - // setup nodes - ninput.Resize( Shape2( batch_size, num_in ) ); - nhidden.Resize( Shape2( batch_size, num_hidden ) ); - nhiddenbak.Resize( nhidden.shape ); - nout.Resize( Shape2( batch_size, num_out ) ); - // setup bias - hbias.Resize( Shape1( num_hidden ) ); g_hbias.Resize( hbias.shape ); - obias.Resize( Shape1( num_out ) ); g_obias.Resize( obias.shape ); - hbias = 0.0f; obias = 0.0f; - // setup weights - Wi2h.Resize( Shape2( num_in, num_hidden ) ); g_Wi2h.Resize( Wi2h.shape ); - Wh2o.Resize( Shape2( num_hidden, num_out ) ); g_Wh2o.Resize( Wh2o.shape ); - rnd.SampleGaussian( Wi2h, 0, 0.01f ); - rnd.SampleGaussian( Wh2o, 0, 0.01f ); - - } - virtual ~NNet(){} - // forward propagation - virtual void Forward( const Tensor& inbatch, Tensor &oubatch ){ - // note: in mshadow, shape[0] means lowest dimension, shape[1] is number of rows in matrix - // this is different from numpy convention - index_t batch_size = inbatch.shape[1]; - // copy data to input layer - Copy( ninput, inbatch ); - // first layer, fullc - nhidden = dot( ninput, Wi2h ); - nhidden+= repmat( hbias, batch_size ); - // activation, sigmloid, backup activation in nhidden - nhidden = F( nhidden ); - Copy( nhiddenbak, nhidden ); - // second layer fullc - nout = dot( nhiddenbak, Wh2o ); - nout += repmat( obias, batch_size ); - // softmax calculation - Softmax( nout, nout ); - // copy result out - Copy( oubatch, nout ); - } - // back propagation - virtual void Backprop( const Tensor& gradout ){ - // copy gradient to output layer - Copy( nout, gradout ); - // calc grad of layer 2 - g_obias = sum_rows( nout ); - g_Wh2o = dot( nhiddenbak.T(), nout ); - // backprop to layer 1 - nhiddenbak = dot( nout, Wh2o.T() ); - // calculate gradient of sigmoid layer - nhidden = nhidden * (1.0f-nhidden) * nhiddenbak; - // calc grad of layer 1 - g_hbias = sum_rows( nhidden ); - g_Wi2h = dot( ninput.T(), nhidden ); - } - // update weight - virtual void Update( void ){ - // run SGD - const float eta = 0.8; - const float wd = 0.00001; - // update weight - Wi2h -= eta * ( wd * Wi2h + g_Wi2h ); - Wh2o -= eta * ( wd * Wh2o + g_Wh2o ); - // no regularization for bias - hbias-= eta * g_hbias; - obias-= eta * g_obias; - } -private: - // random seed generator - Random rnd; - // nodes in neural net - TensorContainer ninput, nhidden, nhiddenbak, nout; - // hidden bias, gradient - TensorContainer hbias, obias, g_hbias, g_obias; - // weight gradient - TensorContainer Wi2h, Wh2o, g_Wi2h, g_Wh2o; -}; - -// helper function to get the max inde -inline int MaxIndex( Tensor pred ){ - int maxidx = 0; - for( index_t i = 1; i < pred.shape[0]; ++i ){ - if( pred[i] > pred[maxidx] ) maxidx = (int)i; - } - return maxidx; -} - -int main( int argc, char *argv[] ){ - if( argc < 2 ){ - printf("Usage: cpu or gpu\n"); return 0; - } - srand(0); - InitTensorEngine(); - - // settings - int batch_size = 100; - int num_in = 28 * 28; - int num_hidden = 100; - int num_out = 10; - - // choose which version to use - INNet *net; - if( !strcmp( argv[1], "gpu") ) { -#if DMSHADOW_USE_CUDA==1 - net = new NNet( batch_size, num_in, num_hidden, num_out ); -#endif - }else{ - net = new NNet( batch_size, num_in, num_hidden, num_out ); - } - - // temp output layer - TensorContainer pred; - pred.Resize( Shape2( batch_size, num_out ) ); - - // label - std::vector ytrain, 
ytest; - // data - TensorContainer xtrain, xtest; - LoadMNIST( "train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain, true); - LoadMNIST( "t10k-images-idx3-ubyte", "t10k-labels-idx1-ubyte", ytest, xtest, false); - - int num_iter = 20; - - for( int i = 0; i < num_iter; ++ i ){ - // training - for( index_t j = 0; j + batch_size <= xtrain.shape[1]; j += batch_size ){ - net->Forward( xtrain.Slice( j, j + batch_size ), pred ); - // set gradient into pred - for( int k = 0; k < batch_size; ++ k ){ - pred[k][ ytrain[k+j] ] -= 1.0f; - } - // scale gradient by batchs zie - pred *= 1.0f / batch_size; - // run backprop - net->Backprop( pred ); - // update net parameters - net->Update(); - } - // evaluation - long nerr = 0; - for( index_t j = 0; j + batch_size <= xtest.shape[1]; j += batch_size ){ - net->Forward( xtest.Slice( j, j + batch_size ), pred ); - for( int k = 0; k < batch_size; ++ k ){ - nerr += MaxIndex( pred[k] ) != ytest[j+k]; - - } - } - printf("round %d: test-err=%f\n", i, (float)nerr/xtest.shape[1] ); - } - delete net; - ShutdownTensorEngine(); - return 0; -} diff --git a/example/neuralnet/run.sh b/example/neuralnet/run.sh deleted file mode 100644 index 8b137891791f..000000000000 --- a/example/neuralnet/run.sh +++ /dev/null @@ -1 +0,0 @@ - diff --git a/example/neuralnet/util.h b/example/neuralnet/util.h deleted file mode 100644 index 50bcef3fdd90..000000000000 --- a/example/neuralnet/util.h +++ /dev/null @@ -1,82 +0,0 @@ -#pragma once -#include -#include -#include -#include "mshadow/tensor.h" - -using namespace mshadow; - -int pack( unsigned char zz[4] ){ - return (int)(zz[3]) - | (((int)(zz[2])) << 8) - | (((int)(zz[1])) << 16) - | (((int)(zz[0])) << 24); -} - -template -inline void shuffle( T *data, size_t sz ){ - if( sz == 0 ) return; - for( size_t i = sz - 1; i > 0; i-- ){ - std::swap( data[i], data[ rand() % ( i+1 ) ] ); - } -} -// random shuffle the data inside, require PRNG -template -inline void shuffle( std::vector &data ){ - shuffle( &data[0], data.size() ); -} - -// simple function to load in mnist -inline void LoadMNIST( const char *path_img, const char *path_label, - std::vector& ylabel, TensorContainer& xdata, bool do_shuffle ){ - // load in data - FILE *fi = fopen( path_img, "rb" ); - if( fi == NULL ){ - printf("cannot open %s\n", path_img ); - exit(-1); - } - unsigned char zz[4]; - unsigned char *t_data, *l_data; - int num_image, width, height, nlabel; - assert( fread(zz, 4 , 1, fi ) ); - assert( fread(zz, 4 , 1, fi ) ); - num_image = pack( zz ); - assert( fread(zz, 4 , 1, fi ) ); - width = pack( zz ); - assert( fread(zz, 4 , 1, fi ) ); - height = pack( zz ); - - int step = width * height; - t_data = new unsigned char[ num_image * step ]; - assert( fread( t_data, step*num_image , 1 , fi ) ); - fclose( fi ); - - // load in label - fi = fopen( path_label, "rb" ); - assert( fread(zz, 4 , 1, fi ) ); - assert( fread(zz, 4 , 1, fi ) ); - nlabel = pack( zz ); - assert( num_image == nlabel ); - l_data = new unsigned char[ num_image ]; - assert( fread( l_data, num_image , 1 , fi ) ); - // try to do shuffle - std::vector rindex; - for( int i = 0; i < num_image; ++ i ){ - rindex.push_back( i ); - } - if( do_shuffle ){ - shuffle( rindex ); - } - - // save out result - ylabel.resize( num_image ); - xdata.Resize( Shape2( num_image, width * height ) ); - for( int i = 0 ; i < num_image ; ++i ){ - for( int j = 0; j < step; ++j ) { - xdata[ i ][ j ] = (float)(t_data[ rindex[i]*step + j ]) / 256.0f; - } - ylabel[ i ] = l_data[ rindex[i] ]; - } - delete[] t_data; delete [] 
l_data; - printf("finish loading %dx%d matrix from %s, shuffle=%d\n", num_image, step, path_img, (int)do_shuffle ); -} diff --git a/guide/.gitignore b/guide/.gitignore new file mode 100644 index 000000000000..f4ccede58e76 --- /dev/null +++ b/guide/.gitignore @@ -0,0 +1,2 @@ +defop +basic \ No newline at end of file diff --git a/example/Makefile b/guide/Makefile similarity index 72% rename from example/Makefile rename to guide/Makefile index cceb3567f859..930867bb7bf2 100644 --- a/example/Makefile +++ b/guide/Makefile @@ -2,15 +2,17 @@ export CC = gcc export CXX = g++ export NVCC =nvcc -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../ -export LDFLAGS= -lm -lcudart -lcublas -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lpthread -export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) +include config.mk +include ../make/mshadow.mk +export CFLAGS = -Wall -O3 -I../ $(MSHADOW_CFLAGS) +export LDFLAGS= -lm $(MSHADOW_LDFLAGS) +export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) # specify tensor path BIN = basic defop OBJ = CUOBJ = -CUBIN = +CUBIN = .PHONY: clean all all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ) diff --git a/guide/README.md b/guide/README.md new file mode 100644 index 000000000000..ee36ca789208 --- /dev/null +++ b/guide/README.md @@ -0,0 +1,221 @@ +Tutorial of mshadow +===== +This is a beginner's tutorial of mshadow. If you like mshadow and have ideas to improve this tutorial, you are more than welcomed:) +Please send a pull-request if you would like to share your experience. + +See also other related materials about mshadow +* [Expression Template Tutorial](exp-template) +* [Writing Multi-GPU and Distributed ML](mshadow-ps) + +**List of Topics** +* [Tensor Data Structure](#tensor-data-structure) +* [Memory Allocation](#memory-allocation) +* [Elementwise Operations](#elementwise-operations) +* [One code for both CPU and GPU](#one-code-for-both-cpu-and-gpu) +* [Matrix Multiplications](#matrix-multiplications) +* [User Defined Operator](#user-defined-operator) + +Tensor Data Structure +==== +The basic data structure of mshadow is Tensor. The following is a simplified equivalent version of +the declaration in [mashadow/tensor.h](../mshadow/tensor.h) +```c++ +typedef unsigned index_t; +template +struct Shape { + index_t shape_[dimension]; +}; +template +struct Tensor { + DType *dptr_; + Shape shape_; + index_t stride_; +}; +// this is how shape object declaration look like +Shape<2> shape2; +// this is how tensor object declaration look like +// you can +Tensor ts2; +Tensor ts3; +``` +``` Tensor``` means a two dimensional tensor in CPU, while ``` Tensor``` means three dimensional tensor in GPU. +```Shape``` gives the shape information of k-dimensional tensor. The declaration use template, and +can be specialized into tensor of specific device and dimension. This is what two dimensional tensor will look like: +```c++ +struct Shape<2> { + index_t shape_[2]; +}; +struct Tensor { + float *dptr_; + Shape<2> shape_; + index_t stride_; +}; +``` +* ``` Tensor``` contains ```dptr_```, which points to the space that backup the tensor. +* ```Shape<2>``` is a structure that stores shape information, the convention is same as numpy +* ```stride_``` gives the number of cell space allocated in the smallest dimension (if we use numpy convention, the dimension corresponds to shape_[-1]). + This is introduced when we introduce some padding cells in lowest dimension to make sure memory is aligned. 
+ - ```stride_``` is automatically set during memory allocation of tensor in mshadow. + +To understand the data structure, consider the following code: +``` c++ +float data[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; +Tensor ts; +ts.dptr_ = data; +ts.shape_ = mshadow::Shape2(3, 2); +ts.stride_ = 3; +// now: ts[0][0] == 0, ts[0][1] == 1 , ts[1][0] == 3, ts[1][1] == 4 +for (index_t i = 0; i < ts.size(0); ++i) { + for (index_t j = 0; j < ts.size(1), ++j) { + printf("ts[%u][%u]=%f\n", i, j, ts[i][j]); + } +} +``` +The result ts should be a 3 * 2 matrix, where data[2], data[5], data[8] are padding cells that are ignored. If you want a continuous memory, set ```stride_=shape_[1]```. + +Memory Allocation +==== +An important design choice about mshadow is that the data structure is a **whitebox**: +it works so long as we set the space pointer ```dptr_```, corresponding ```shape_``` and ```stride_```: +* For ```Tensor```, the space can be created by ```new float[]```, or pointer to some existing space such as float array in last example. +* For ```Tensor```, the space need to lie in GPU, created by ```cudaMallocPitch``` + +mshadow also provide explicit memory allocation routine, demonstrated shown by following code +``` c++ +// create a 5 x 3 tensor on GPU, and allocate space +Tensor ts2(Shape2(5, 3)); +AllocSpace(&ts2); +// allocate 5 x 3 x 2 tensor on CPU, initialized by 0 +Tensor ts3 = NewTensor(Shape3(5,3,2), 0.0f); +// free space +FreeSpace(&ts2); FreeSpace(&ts3); +``` +All memory allocations in mshadow are **explicit**. There is **no** implicit memory allocation and de-allocation during any operations. +This means ```Tensor``` variable is more like a reference handle(pointer), instead of a object. If we assign a tensor to another variable, the two share the same content space. + +This also allows user to use mshadow in their existing project easily, simply give mshadow the pointer of the memory and you can get the benefit of all the mshadow expressions with zero cost:) + +Elementwise Operations +==== +All the operators(+, -, *, /, += etc.) in mshadow are element-wise. Consider the following SGD update code: +```c++ +void UpdateSGD(Tensor weight, Tensor grad, float eta, float lambda) { + weight -= eta * (grad + lambda * weight); +} +``` +During compilation, this code will be translated to the following form: +```c++ +void UpdateSGD(Tensor weight, Tensor grad, float eta, float lambda) { + for (index_t y = 0; y < weight.size(0); ++y) { + for (index_t x = 0; x < weight.size(1); ++x) { + weight[y][x] -= eta * (grad[y][x] + lambda * weight[y][x]); + } + } +} +``` +As we can see, *no memory allocation* is happened in the translated code. For ```Tensor```, the corresponding function will be translated into a CUDA kernel of same spirit. +Using [Expression Template](exp-template), the translation is happened during compile time. We can write simple lines of code while get the full performance of the translated code. + +One code for both CPU and GPU +==== +Since mshadow have identical interface for ```Tensor``` and ```Tensor```, we can easily write one code that works in both CPU and GPU. +For example, the following code compiles for both GPU and CPU Tensors. +```c++ +template +void UpdateSGD(Tensor weight, const Tensor &grad, + float eta, float lambda) { + weight -= eta * (grad + lambda * weight); +} +``` +Matrix Multiplications +==== +We also have short hands for dot product, as like follows. The code will be translated to call standard packages such as MKL and CuBLAS. 
+```c++ +template +void Backprop(Tensor gradin, + const Tensor &gradout, + const Tensor &netweight) { + gradin = dot(gradout, netweight.T()); +} +``` +Again, the code can compile for both GPU and CPU Tensors + +User Defined Operator +==== +There are common cases when we want to define our own function. For example, assume we do not have element-wise sigmoid transformation in mshadow, +which is very commonly used in machine learning algorithms. We simply use the following code to add sigmoid to mshadow +```c++ +struct sigmoid { + MSHADOW_XINLINE static float Map(float a) { + return 1.0f / (1.0f + expf(-a)); + } +}; +template +void ExampleSigmoid(Tensor out, const Tensor &in) { + out = F(in * 2.0f) + 1.0f; +} +``` +The equivalent translated code for CPU is given by +```c++ +template +void ExampleSigmoid(Tensor out, const Tensor &in) { + for (index_t y = 0; y < out.size(0); ++y) { + for(index_t x = 0; x < out.size(1); ++x) { + out[y][x] = sigmoid::Map(in[y][x] * 2.0f) + 1.0f; + } + } +} +``` +Also note that the defined operation can be **composited into expressions**, not only we can write ```out = F(in)```, +we can also write ```out = F+2.0``` or ```out = F(F(in))```. + +There will also be a translated CUDA kernel version that runs in GPU. Check out [defop.cpp](defop.cpp) for complete example. + +Complete Example +==== +The following code is from [basic.cpp](basic.cpp), that illustrate basic usage of mshadow. + +```c++ +// header file to use mshadow +#include "mshadow/tensor.h" +// this namespace contains all data structures, functions +using namespace mshadow; +// this namespace contains all operator overloads +using namespace mshadow::expr; + +int main(void) { + // intialize tensor engine before using tensor operation, needed for CuBLAS + InitTensorEngine(); + // assume we have a float space + float data[20]; + // create a 2 x 5 x 2 tensor, from existing space + Tensor ts(data, Shape3(2,5,2)); + // take first subscript of the tensor + Tensor mat = ts[0]; + // Tensor object is only a handle, assignment means they have same data content + // we can specify content type of a Tensor, if not specified, it is float bydefault + Tensor mat2 = mat; + + // shaape of matrix, note size order is same as numpy + printf("%u X %u matrix\n", mat.size(1), mat.size(1)); + + // initialize all element to zero + mat = 0.0f; + // assign some values + mat[0][1] = 1.0f; mat[1][0] = 2.0f; + // elementwise operations + mat += (mat + 10.0f) / 10.0f + 2.0f; + + // print out matrix, note: mat2 and mat1 are handles(pointers) + for (index_t i = 0; i < mat.size(0); ++i) { + for (index_t j = 0; j < mat.size(1); ++j) { + printf("%.2f ", mat2[i][j]); + } + printf("\n"); + } + // shutdown tensor enigne after usage + ShutdownTensorEngine(); + return 0; +} +``` + diff --git a/guide/basic.cpp b/guide/basic.cpp new file mode 100644 index 000000000000..cb6586d398d0 --- /dev/null +++ b/guide/basic.cpp @@ -0,0 +1,42 @@ +// header file to use mshadow +#include "mshadow/tensor.h" +// this namespace contains all data structures, functions +using namespace mshadow; +// this namespace contains all operator overloads +using namespace mshadow::expr; + +int main(void) { + // intialize tensor engine before using tensor operation, needed for CuBLAS + InitTensorEngine(); + // assume we have a float space + float data[20]; + // create a 2 x 5 x 2 tensor, from existing space + Tensor ts(data, Shape3(2,5,2)); + // take first subscript of the tensor + Tensor mat = ts[0]; + // Tensor object is only a handle, assignment means they have same data 
content + // we can specify content type of a Tensor, if not specified, it is float bydefault + Tensor mat2 = mat; + mat = Tensor(data, Shape1(10)).FlatTo2D(); + + // shaape of matrix, note size order is same as numpy + printf("%u X %u matrix\n", mat.size(0), mat.size(1)); + return 0; + // initialize all element to zero + mat = 0.0f; + // assign some values + mat[0][1] = 1.0f; mat[1][0] = 2.0f; + // elementwise operations + mat += (mat + 10.0f) / 10.0f + 2.0f; + + // print out matrix, note: mat2 and mat1 are handles(pointers) + for (index_t i = 0; i < mat.size(0); ++i) { + for (index_t j = 0; j < mat.size(1); ++j) { + printf("%.2f ", mat2[i][j]); + } + printf("\n"); + } + // shutdown tensor enigne after usage + ShutdownTensorEngine(); + return 0; +} diff --git a/guide/basic_stream.cu b/guide/basic_stream.cu new file mode 100644 index 000000000000..18dc64ed4c7f --- /dev/null +++ b/guide/basic_stream.cu @@ -0,0 +1,33 @@ +// header file to use mshadow +#include "mshadow/tensor.h" +// this namespace contains all data structures, functions +using namespace mshadow; +// this namespace contains all operator overloads +using namespace mshadow::expr; + +int main(void) { + // intialize tensor engine before using tensor operation, needed for CuBLAS + InitTensorEngine(); + // create a 2 x 5 tensor, from existing space + Tensor ts1 = NewTensor(Shape2(2, 5), 0.0f); + Tensor ts2 = NewTensor(Shape2(2, 5), 0.0f); + ts1.stream_ = NewStream(); + ts2.stream_ = NewStream(); + ts1 = 1; // Should use stream 0. + ts2 = 2; // Should use stream 1. Can run in parallel with stream 0. + Tensor res = NewTensor(Shape2(2, 2), 0.0f); + res.stream_ = NewStream(); + res = dot(ts1, ts2.T()); //Should use stream 2. + + Tensor cpu_res = NewTensor(Shape2(2, 2), 0.0f); + Copy(cpu_res, res); // default stream, should be 0. 
+ for (index_t i = 0; i < cpu_res.size(0); ++i){ + for (index_t j = 0; j < cpu_res.size(1); ++j){ + printf("%.2f ", cpu_res[i][j]); + } + printf("\n"); + } + // shutdown tensor enigne after usage + ShutdownTensorEngine(); + return 0; +} diff --git a/guide/config.mk b/guide/config.mk new file mode 100644 index 000000000000..b28f41741543 --- /dev/null +++ b/guide/config.mk @@ -0,0 +1,35 @@ +#--------------------------------------------------------------------------------------- +# mshadow: the configuration compile script +# +# This is configuration script that you can use to compile mshadow +# Usage: +# +# include config.mk in your Makefile, or directly include the definition of variables +# include mshadow.mk after the variables are set +# +# Add MSHADOW_CFLAGS to the compile flags +# Add MSHADOW_LDFLAGS to the linker flags +# Add MSHADOW_NVCCFLAGS to the nvcc compile flags +#---------------------------------------------------------------------------------------- + +# whether use CUDA during compile +USE_CUDA = 0 + +# add the path to CUDA libary to link and compile flag +# if you have already add them to enviroment variable, leave it as NONE +USE_CUDA_PATH = NONE + +# +# choose the version of blas you want to use +# can be: mkl, blas, atlas, openblas, apple +USE_BLAS = atlas +# +# add path to intel library, you may need it +# for MKL, if you did not add the path to enviroment variable +# +USE_INTEL_PATH = NONE + +# whether compile with parameter server +USE_DIST_PS = 0 +PS_PATH = NONE +PS_THIRD_PATH = NONE diff --git a/guide/defop.cpp b/guide/defop.cpp new file mode 100644 index 000000000000..074b81cc141e --- /dev/null +++ b/guide/defop.cpp @@ -0,0 +1,47 @@ +#include +// header file to use mshadow +#include "mshadow/tensor.h" +// this namespace contains all data structures, functions +using namespace mshadow; +// this namespace contains all operator overloads +using namespace mshadow::expr; + +// user defined unary operator addone +struct addone { + // map can be template function + template + MSHADOW_XINLINE static DType Map(DType a) { + return a + static_cast(1); + } +}; +// user defined binary operator max of two +struct maxoftwo { + // map can also be normal functions, + // however, this can only be applied to float tensor + MSHADOW_XINLINE static float Map(float a, float b) { + if(a > b) return a; + else return b; + } +}; + +int main(void) { + // intialize tensor engine before using tensor operation, needed for CuBLAS + InitTensorEngine(); + // take first subscript of the tensor + Tensor mat = NewTensor(Shape2(2,3), 0.0f); + Tensor mat2= NewTensor(Shape2(2,3), 0.0f); + + mat[0][0] = -2.0f; + mat = F(F(mat) + 0.5f, mat2); + + for (index_t i = 0; i < mat.size(0); ++i) { + for (index_t j = 0; j < mat.size(1); ++j) { + printf("%.2f ", mat[i][j]); + } + printf("\n"); + } + FreeSpace(&mat); FreeSpace(&mat2); + // shutdown tensor enigne after usage + ShutdownTensorEngine(); + return 0; +} diff --git a/guide/exp-template/.gitignore b/guide/exp-template/.gitignore new file mode 100644 index 000000000000..fc070ad5bd7e --- /dev/null +++ b/guide/exp-template/.gitignore @@ -0,0 +1 @@ +exp_* \ No newline at end of file diff --git a/example/exp-template/Makefile b/guide/exp-template/Makefile similarity index 100% rename from example/exp-template/Makefile rename to guide/exp-template/Makefile diff --git a/guide/exp-template/README.md b/guide/exp-template/README.md new file mode 100644 index 000000000000..c824d8e4e3c6 --- /dev/null +++ b/guide/exp-template/README.md @@ -0,0 +1,340 @@ +Expression 
Template Tutorial +==== +This page explains how mshadow works. The main trick behind mshadow is called [Expression Template](http://en.wikipedia.org/wiki/Expression_templates). +We will explain how it will affect the performance of compiled code. Expression template is the major trick behind the C++ matrix libraries such as Eigen, GSL, boost.uBLAS. + +How to write efficient machine learning code +==== +Before we start, let us think of the question above. Assume we want to write down the update rule +```c++ +weight = - eta * (grad + lambda * weight); +``` +Where weight and grad are vectors of length ```n```. When you choose C++ as your programming language, +I guess the major concern is efficiency. There is one principle that is important and used in most C/C++ programs: +* Pre-allocate necessary memory, **no temporal memory allocation** during running. + +An example code is like +```c++ +void UpdateWeight (const float *grad, float eta, float lambda, + int n, float *weight) { + for (int i = 0; i < n; ++i) { + weight[i] = - eta * (grad[i] + lambda * weight[i]); + } +} +``` +The function takes the pre-allocated space grad, and weight, and run the calculation. Writing these functions are simple, +however, it can be annoying when we write them repeatedly. So the question is, can we write as follows, and get same performance as previous code? +```c++ +void UpdateWeight (const Vec& grad, float eta, float lambda, Vec& weight) { + weight = -eta * (grad + lambda * weight); +} +``` +The answer is yes, but not by the most obvious solution. + +A Naive Bad Solution +==== +Let us first take a look at a most straight forward solution: operator overloading. +```c++ +// Naive solution for vector operation overloading +struct Vec { + int len; + float* dptr; + Vec(int len) : len(len) { + dptr = new float[len]; + } + Vec(const Vec& src) : len(src.len) { + dptr = new float[len]; + memcpy(dptr, src.dptr, sizeof(float)*len ); + } + ~Vec(void) { + delete [] dptr; + } +}; + +inline Vec operator+(const Vec &lhs, const Vec &rhs) { + Vec res(lhs.len); + for (int i = 0; i < lhs.len; ++i) { + res.dptr[i] = lhs.dptr[i] + rhs.dptr[i]; + } + return res; +} +``` +If we add more operators overloading in the same style, we can get what we want, and write equations instead of loop. +However, this kind of approach is inefficient, because temporal memory is allocated and de-allocated during each operation, while we could have done better. + +An alternative, more effective way is only overload operator+=, operator-=, which can be implemented without temporal memory allocation. But this limits the equations we can write. + +We will discuss why we still need expression template although C++11 provides move assignment operator and rvalue reference at the end of this tutorial. + +Lazy Evaluation +==== +Let us think why we need temporal memory allocation when doing operator+. This is because we *do not know* the target that will be assigned to in operator+, +otherwise we could have directly storing into target memory instead of temporal memory. + +What if we can know the target? The following code ([exp_lazy.cpp](exp_lazy.cpp)) achieves this. 
+```c++ +// Example Lazy evaluation code +// for simplicity, we use struct and make all members public +#include +struct Vec; +// expression structure holds the expression +struct BinaryAddExp { + const Vec &lhs; + const Vec &rhs; + BinaryAddExp(const Vec &lhs, const Vec &rhs) + : lhs(lhs), rhs(rhs) {} +}; +// no constructor and destructor to allocate and de-allocate memory, +// allocation done by user +struct Vec { + int len; + float* dptr; + Vec(void) {} + Vec(float *dptr, int len) + : len(len), dptr(dptr) {} + // here is where evaluation happens + inline Vec &operator=(const BinaryAddExp &src) { + for (int i = 0; i < len; ++i) { + dptr[i] = src.lhs.dptr[i] + src.rhs.dptr[i]; + } + return *this; + } +}; +// no evaluation happens here +inline BinaryAddExp operator+(const Vec &lhs, const Vec &rhs) { + return BinaryAddExp(lhs, rhs); +} + +const int n = 3; +int main(void) { + float sa[n] = {1, 2, 3}; + float sb[n] = {2, 3, 4}; + float sc[n] = {3, 4, 5}; + Vec A(sa, n), B(sb, n), C(sc, n); + // run expression + A = B + C; + for (int i = 0; i < n; ++i) { + printf("%d:%f==%f+%f\n", i, A.dptr[i], B.dptr[i], C.dptr[i]); + } + return 0; +} +``` +The idea is that we do not actually do computation in operator+, but only return a expression structure (like abstract syntax tree), +and when we overload operator=, we see the target, as well as all the operands, and we can run computation without introducing extra memory! +Similarly, we can define a DotExp and lazily evaluate at operator=, and redirect matrix(vector) multiplications to BLAS. + + +More Lengthy Expressions and Expression Template +==== +By using lazy evaluation, we are cool by avoiding temporal memory allocations. But the ability of the code is limited: +* We can only write ```A=B+C```, but not more lengthy expressions. +* When we add more expression, we need to write more operator= to evaluate each equations. + +Here is where the magic of template programming comes to rescue. The following code ([exp_template.cpp](exp_template.cpp)), +which is a bit more lengthy, also allows you to write lengthy equations. 
+```c++ +// Example code, expression template, and more length equations +// for simplicity, we use struct and make all members public +#include + +// this is expression, all expressions must inheritate it, +// and put their type in subtype +template +struct Exp { + // returns const reference of the actual type of this expression + inline const SubType& self(void) const { + return *static_cast(this); + } +}; + +// binary add expression +// note how it is inheritates from Exp +// and put its own type into the template argument +template +struct BinaryAddExp: public Exp > { + const TLhs &lhs; + const TRhs &rhs; + BinaryAddExp(const TLhs& lhs, const TRhs& rhs) + : lhs(lhs), rhs(rhs) {} + // evaluation function, evaluate this expression at position i + inline float Eval(int i) const { + return lhs.Eval(i) + rhs.Eval(i); + } +}; +// no constructor and destructor to allocate +// and de-allocate memory, allocation done by user +struct Vec: public Exp { + int len; + float* dptr; + Vec(void) {} + Vec(float *dptr, int len) + :len(len), dptr(dptr) {} + // here is where evaluation happens + template + inline Vec& operator= (const Exp& src_) { + const EType &src = src_.self(); + for (int i = 0; i < len; ++i) { + dptr[i] = src.Eval(i); + } + return *this; + } + // evaluation function, evaluate this expression at position i + inline float Eval(int i) const { + return dptr[i]; + } +}; +// template add, works for any expressions +template +inline BinaryAddExp +operator+(const Exp &lhs, const Exp &rhs) { + return BinaryAddExp(lhs.self(), rhs.self()); +} + +const int n = 3; +int main(void) { + float sa[n] = {1, 2, 3}; + float sb[n] = {2, 3, 4}; + float sc[n] = {3, 4, 5}; + Vec A(sa, n), B(sb, n), C(sc, n); + // run expression, this expression is longer:) + A = B + C + C; + for (int i = 0; i < n; ++i) { + printf("%d:%f == %f + %f + %f\n", i, + A.dptr[i], B.dptr[i], + C.dptr[i], C.dptr[i]); + } + return 0; +} +``` +The key idea of the code is the template ```Exp``` takes type of its derived class as template argument, so it can convert itself to +the SubType via ```self()```. BinaryAddExp now is a template class that can composite expressions together, like a template version of Composite pattern. +The evaluation is done through function Eval, which is done in a recursive way in BinaryAddExp. +* Due to inlining, the function calls of ```src.Eval(i)``` in ```operator=``` will be compiled into ```B.dptr[i] + C.dptr[i] + C.dptr[i]``` in compile time. +* We can write equations for element-wise operations with same efficiency as if we write a loop + +Make it more flexible +==== +As we can find in the previous example, template programming is a powerful to make things flexible in compile time, our final example, +which is closer to mshadow, allows user customized binary operators ([exp_template_op.cpp](exp_template_op.cpp)). 
+```c++ +// Example code, expression template +// with binary operator definition and extension +// for simplicity, we use struct and make all members public +#include + +// this is expression, all expressions must inheritate it, +// and put their type in subtype +template +struct Exp{ + // returns const reference of the actual type of this expression + inline const SubType& self(void) const { + return *static_cast(this); + } +}; + +// binary operators +struct mul{ + inline static float Map(float a, float b) { + return a * b; + } +}; + +// binary add expression +// note how it is inheritates from Exp +// and put its own type into the template argument +template +struct BinaryMapExp: public Exp >{ + const TLhs& lhs; + const TRhs& rhs; + BinaryMapExp(const TLhs& lhs, const TRhs& rhs) + :lhs(lhs), rhs(rhs) {} + // evaluation function, evaluate this expression at position i + inline float Eval(int i) const { + return OP::Map(lhs.Eval(i), rhs.Eval(i)); + } +}; +// no constructor and destructor to allocate and de-allocate memory +// allocation done by user +struct Vec: public Exp{ + int len; + float* dptr; + Vec(void) {} + Vec(float *dptr, int len) + : len(len), dptr(dptr) {} + // here is where evaluation happens + template + inline Vec& operator=(const Exp& src_) { + const EType &src = src_.self(); + for (int i = 0; i < len; ++i) { + dptr[i] = src.Eval(i); + } + return *this; + } + // evaluation function, evaluate this expression at position i + inline float Eval(int i) const { + return dptr[i]; + } +}; +// template add, works for any expressions +template +inline BinaryMapExp +F(const Exp& lhs, const Exp& rhs) { + return BinaryMapExp(lhs.self(), rhs.self()); +} + +template +inline BinaryMapExp +operator*(const Exp& lhs, const Exp& rhs) { + return F(lhs, rhs); +} + +// user defined operation +struct maximum{ + inline static float Map(float a, float b) { + return a > b ? a : b; + } +}; + +const int n = 3; +int main(void) { + float sa[n] = {1, 2, 3}; + float sb[n] = {2, 3, 4}; + float sc[n] = {3, 4, 5}; + Vec A(sa, n), B(sb, n), C(sc, n); + // run expression, this expression is longer:) + A = B * F(C, B); + for (int i = 0; i < n; ++i) { + printf("%d:%f == %f * max(%f, %f)\n", + i, A.dptr[i], B.dptr[i], C.dptr[i], B.dptr[i]); + } + return 0; +} +``` + +Summary +===== +Up to this point, you should have understand basic ideas how it works: +* Lazy evaluation, to allow us see all the operands and target +* Template composition and recursive evaluation, to allows us evaluate arbitrary composite expressions for element-wise operations. +* Due to template and inlining, writing expressions are as efficient as if we directly write a for loop to implement the update rule:) + +So write expressions when you write machine learning codes, and focus your energy on the algorithm part that matters. + +The Expression Template in MShadow +===== +Expression template in mshadow use the same key points as we introduced in the tutorial, with some minor differences: +* We separate evaluation code from expression construction and composition code. + - Instead of putting Eval in Exp class. A Plan class is created from expression, and used to evaluate the result. + - This allows us to put less variables in Plan, for example, we do not need array length when we evaluate a data. + - One important reason is CUDA kernel cannot take class with const references + - This design choice is debatable, but we find it is useful so far. 
+* Lazy support for complex expressions such as matrix dot product + - Besides element-wise expressions, we also want to support sugars such as ```A = dot(B.T(), C)```, again, lazy evaluation is used and no extra memory is allocated. +* Type checking and array length checking. + +Notes +==== +* Expression Template and C++11: in C++11, move constructor can be used to save repetitive allocation memory, which removes some need to expression template. However, the space still needs to be allocated at least once. + - This only removes the need of expression template then expression generate space, say dst = A+B+C, dst does not contain space allocated before assignment. + - If we want to keep the syntax that everything is pre-allocated, and expression executes without memory allocation (which is what we did in mshadow), we still need expression template. + diff --git a/guide/exp-template/exp_lazy.cpp b/guide/exp-template/exp_lazy.cpp new file mode 100644 index 000000000000..4e6a6b14b9de --- /dev/null +++ b/guide/exp-template/exp_lazy.cpp @@ -0,0 +1,45 @@ +// Example Lazy evaluation code +// for simplicity, we use struct and make all members public +#include +struct Vec; +// expression structure holds the expression +struct BinaryAddExp { + const Vec &lhs; + const Vec &rhs; + BinaryAddExp(const Vec &lhs, const Vec &rhs) + : lhs(lhs), rhs(rhs) {} +}; +// no constructor and destructor to allocate and de-allocate memory, +// allocation done by user +struct Vec { + int len; + float* dptr; + Vec(void) {} + Vec(float *dptr, int len) + : len(len), dptr(dptr) {} + // here is where evaluation happens + inline Vec &operator=(const BinaryAddExp &src) { + for (int i = 0; i < len; ++i) { + dptr[i] = src.lhs.dptr[i] + src.rhs.dptr[i]; + } + return *this; + } +}; +// no evaluation happens here +inline BinaryAddExp operator+(const Vec &lhs, const Vec &rhs) { + return BinaryAddExp(lhs, rhs); +} + +const int n = 3; +int main(void) { + float sa[n] = {1, 2, 3}; + float sb[n] = {2, 3, 4}; + float sc[n] = {3, 4, 5}; + Vec A(sa, n), B(sb, n), C(sc, n); + // run expression + A = B + C; + for (int i = 0; i < n; ++i) { + printf("%d:%f==%f+%f\n", i, A.dptr[i], B.dptr[i], C.dptr[i]); + } + return 0; +} diff --git a/guide/exp-template/exp_template.cpp b/guide/exp-template/exp_template.cpp new file mode 100644 index 000000000000..556b10316a3b --- /dev/null +++ b/guide/exp-template/exp_template.cpp @@ -0,0 +1,72 @@ +// Example code, expression template, and more length equations +// for simplicity, we use struct and make all members public +#include + +// this is expression, all expressions must inheritate it, +// and put their type in subtype +template +struct Exp { + // returns const reference of the actual type of this expression + inline const SubType& self(void) const { + return *static_cast(this); + } +}; + +// binary add expression +// note how it is inheritates from Exp +// and put its own type into the template argument +template +struct BinaryAddExp: public Exp > { + const TLhs &lhs; + const TRhs &rhs; + BinaryAddExp(const TLhs& lhs, const TRhs& rhs) + : lhs(lhs), rhs(rhs) {} + // evaluation function, evaluate this expression at position i + inline float Eval(int i) const { + return lhs.Eval(i) + rhs.Eval(i); + } +}; +// no constructor and destructor to allocate +// and de-allocate memory, allocation done by user +struct Vec: public Exp { + int len; + float* dptr; + Vec(void) {} + Vec(float *dptr, int len) + :len(len), dptr(dptr) {} + // here is where evaluation happens + template + inline Vec& operator= (const Exp& 
src_) { + const EType &src = src_.self(); + for (int i = 0; i < len; ++i) { + dptr[i] = src.Eval(i); + } + return *this; + } + // evaluation function, evaluate this expression at position i + inline float Eval(int i) const { + return dptr[i]; + } +}; +// template add, works for any expressions +template +inline BinaryAddExp +operator+(const Exp &lhs, const Exp &rhs) { + return BinaryAddExp(lhs.self(), rhs.self()); +} + +const int n = 3; +int main(void) { + float sa[n] = {1, 2, 3}; + float sb[n] = {2, 3, 4}; + float sc[n] = {3, 4, 5}; + Vec A(sa, n), B(sb, n), C(sc, n); + // run expression, this expression is longer:) + A = B + C + C; + for (int i = 0; i < n; ++i) { + printf("%d:%f == %f + %f + %f\n", i, + A.dptr[i], B.dptr[i], + C.dptr[i], C.dptr[i]); + } + return 0; +} diff --git a/guide/exp-template/exp_template_op.cpp b/guide/exp-template/exp_template_op.cpp new file mode 100644 index 000000000000..249b181ada5b --- /dev/null +++ b/guide/exp-template/exp_template_op.cpp @@ -0,0 +1,92 @@ +// Example code, expression template +// with binary operator definition and extension +// for simplicity, we use struct and make all members public +#include + +// this is expression, all expressions must inheritate it, +// and put their type in subtype +template +struct Exp{ + // returns const reference of the actual type of this expression + inline const SubType& self(void) const { + return *static_cast(this); + } +}; + +// binary operators +struct mul{ + inline static float Map(float a, float b) { + return a * b; + } +}; + +// binary add expression +// note how it is inheritates from Exp +// and put its own type into the template argument +template +struct BinaryMapExp: public Exp >{ + const TLhs& lhs; + const TRhs& rhs; + BinaryMapExp(const TLhs& lhs, const TRhs& rhs) + :lhs(lhs), rhs(rhs) {} + // evaluation function, evaluate this expression at position i + inline float Eval(int i) const { + return OP::Map(lhs.Eval(i), rhs.Eval(i)); + } +}; +// no constructor and destructor to allocate and de-allocate memory +// allocation done by user +struct Vec: public Exp{ + int len; + float* dptr; + Vec(void) {} + Vec(float *dptr, int len) + : len(len), dptr(dptr) {} + // here is where evaluation happens + template + inline Vec& operator=(const Exp& src_) { + const EType &src = src_.self(); + for (int i = 0; i < len; ++i) { + dptr[i] = src.Eval(i); + } + return *this; + } + // evaluation function, evaluate this expression at position i + inline float Eval(int i) const { + return dptr[i]; + } +}; +// template add, works for any expressions +template +inline BinaryMapExp +F(const Exp& lhs, const Exp& rhs) { + return BinaryMapExp(lhs.self(), rhs.self()); +} + +template +inline BinaryMapExp +operator*(const Exp& lhs, const Exp& rhs) { + return F(lhs, rhs); +} + +// user defined operation +struct maximum{ + inline static float Map(float a, float b) { + return a > b ? 
a : b; + } +}; + +const int n = 3; +int main(void) { + float sa[n] = {1, 2, 3}; + float sb[n] = {2, 3, 4}; + float sc[n] = {3, 4, 5}; + Vec A(sa, n), B(sb, n), C(sc, n); + // run expression, this expression is longer:) + A = B * F(C, B); + for (int i = 0; i < n; ++i) { + printf("%d:%f == %f * max(%f, %f)\n", + i, A.dptr[i], B.dptr[i], C.dptr[i], B.dptr[i]); + } + return 0; +} diff --git a/guide/mshadow-ps/Makefile b/guide/mshadow-ps/Makefile new file mode 100644 index 000000000000..70cb724248f0 --- /dev/null +++ b/guide/mshadow-ps/Makefile @@ -0,0 +1,36 @@ +# set LD_LIBRARY_PATH +export CC = gcc +export CXX = g++ +export NVCC =nvcc +include config.mk +include ../../make/mshadow.mk +export CFLAGS = -Wall -O3 -fopenmp -I../../ $(MSHADOW_CFLAGS) +export LDFLAGS= -lm $(MSHADOW_LDFLAGS) +export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) + +# specify tensor path +BIN = local_sum.cpu +OBJ = +CUOBJ = +CUBIN = local_sum.gpu +.PHONY: clean all + +all: $(BIN) $(CUBIN) + +local_sum.cpu: local_sum.cpp +local_sum.gpu: local_sum.cu + +$(BIN) : + $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) + +$(OBJ) : + $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) + +$(CUOBJ) : + $(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $(filter %.cu, $^) + +$(CUBIN) : + $(NVCC) -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -Xlinker "$(LDFLAGS)" $(filter %.cu %.cpp %.o, $^) + +clean: + $(RM) $(OBJ) $(BIN) $(CUBIN) $(CUOBJ) *~ diff --git a/guide/mshadow-ps/README.md b/guide/mshadow-ps/README.md new file mode 100644 index 000000000000..3a95798aae98 --- /dev/null +++ b/guide/mshadow-ps/README.md @@ -0,0 +1,174 @@ +mshadow-ps +==== +### Parameter Server Interface for GPU Tensor + +mshadow-ps provides asynchronize parameter server interface for mshadow GPU/CPU Tensor. +This allows you to do ***multi-GPU*** and ***disrtibuted*** (deep) learning in +an ***easy*** and ***unified*** way. + +####List of Resources +* [API Documentation](http://homes.cs.washington.edu/~tqchen/mshadow/doc/namespacemshadow_1_1ps.html) +* [Library Interface Header](../../mshadow-ps/ps.h) +* Tutorial in this page + +Tutorial +==== +Suppose that we are now implementing a Multi-GPU learning program. +One way to do that is through data parallelism. We can launch many +threads, with each thread compute gradient on one GPU, and aggregate +the statistics together. +However, the gradient synchronization step could be cost time, and in +many cases, we can do the computation in an smarter way, so that +we ***overlaps the computation with the synchronization***. + +mshadow-ps provides interface to do such synchronization in an easy way. +The following documents provides a way + +### Getting Sum from Multiple GPUs +We first get familiar with the interface of mshadow-ps. Through the following +program in [local_sum-inl.h](local_sum-inl.h). You can compile the program +by setup the [config.mk](config.mk) according to your computers's enviroment, and type make. + +In the following program, each thread first does some computation locally, then tries to get the sum +of ```data``` through mshadow-ps interface. 
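+
+Assuming the Makefile keeps its default target names, the resulting binaries take a device list on the
+command line: for example ```./local_sum.cpu 0 1``` launches two CPU worker threads (for CPU the ids are
+arbitrary), while ```./local_sum.gpu 0 1``` runs one worker thread on each of GPU 0 and GPU 1 and requires
+```USE_CUDA = 1``` in [config.mk](config.mk). Running on devices 0 and 1, for instance, each worker should
+see ```data[0]``` filled with 2 and ```data[1]``` filled with 3 after synchronization, because the default
+local behaviour is to return the sum of what every device pushed.
+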
+There are four key functions in ```ISharedModel``` interface +* [InitKey](../../mshadow-ps/ps.h#L76) allocates a key to specific tensor shape +* [Push](../../mshadow-ps/ps.h#L100) pushes out the local data to the synchronization interface + - The data pushed by different devices will be aggregated together by key + - Push is an asynchronize call and returns immediately +* [PullReq](../../mshadow-ps/ps.h#L122) requests the result of synchronization to be copied back + - In the local default case, the synchronized result is the sum of pushed data + - mshadow-ps also support the weight update on server side, where the result of PullReq is the updated weight instead of sum of gradient + - PullReq is also asynchronize +* [PullWait](../../mshadow-ps/ps.h#L87) wait until the pull request of corresponding key finishes + +```c++ +// this function is runed by specific thread +template +inline void RunWorkerThread(int devid, + mshadow::ps::ISharedModel *ps) { + // initialize tensor engine + mshadow::InitTensorEngine(devid); + mshadow::Stream *stream = mshadow::NewStream(); + // allocate tensor on xpu + mshadow::TensorContainer data(mshadow::Shape2(2, 3)); + // set the computation stream to the new allocated stream + // this will make subsequent computation whose target is data + // to use the stream, stream is needed for async execution in GPU + data.set_stream(stream); + // assume these operations sets the content of dataient + data[0] = 1.0f; + data[1] = devid + data[0]; + printf("dev%d: before sync, data:\n", devid); + // use print to show result, do not call + // print normally since Copy will block + Print(data); + printf("====================\n"); + // intiaialize the key, register the shape on parameter server + ps->InitKey(data[0].shape_, 0, devid); + ps->InitKey(data[1].shape_, 1, devid); + // push data[0] out, for update, or aggregation + // 0 is the key of the data, devid is the current device id + ps->Push(data[0], 0, devid); + // pull request is used to request the data to be copied back + // once computation is done + ps->PullReq(data[0], 0, devid); + // computation can be done here.. + // the pull request handler will be overlapped with + // similar as previous call + ps->Push(data[1], 1, devid); + ps->PullReq(data[1], 1, devid); + // more computation can be done here... 
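+  // (for example, the gradient of another network layer could be computed here)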
+ // the computation will be overlapped + // PullWait will block until these request finishes + ps->PullWait(0, devid); + ps->PullWait(1, devid); + printf("dev%d: after sync, data:\n", devid); + // use print to show result, do not call + // print normally since Copy will block + Print(data); + printf("====================\n"); + mshadow::DeleteStream(stream); + mshadow::ShutdownTensorEngine(); +} + +template +inline int Run(int argc, char *argv[]) { + if (argc < 2) { + printf("Usage: device list\n"\ + "\tfor CPU the device list can be arbitrary\n"\ + "\tfor GPU the device list need to be actual device index\n"); + return 0; + } + // list of device ids + std::vector devs; + // initialization + for (int i = 1; i < argc; ++i) { + // record the device id + devs.push_back(atoi(argv[i])); + } + mshadow::ps::ISharedModel + *ps = mshadow::ps::CreateSharedModel("local"); + // intiaialize the ps + ps->Init(devs); + // use openmp to launch #devs threads + #pragma omp parallel num_threads(devs.size()) + { + int tid = omp_get_thread_num(); + RunWorkerThread(devs[tid], ps); + } + delete ps; + return 0; +} +``` +In the above example, we did not do weight update on server side, so the synchronization result is +simply the sum of data on each device. The key property of this interface is that the Push and PullReq are asynchronize. +* We can call these two functions once the gradient is ready, and the mshadow-ps will do the data synchronization in the background. +* When we need the result of synchronization, we simply call PullWait to wait the synchronization task to finish. +* Such interface allows us to do additional computation between the Push/PullReq and PullWait + +### A MultiGPU Neural Net +To get a more concrete understanding of the interface. We give an example of multi-GPU two layer neuralnet +in [../neuralnet/nnet_ps.cu](../neuralnet/nnet_ps.cu). The general idea is follows +* Push and PullReq is called once we get the gradient of certain layer +* PullWait is called before we do forward on that layer next time +* This creates a ***time lag*** between the backprop and next forward to that layer + - mshadow-ps do synchronization concurrently with computations during the time lag + - The time lag is big for latter layers, which also usually need more time to synchronize + +There are several note of the mshadow-ps on the neural net code +* Callback function in PullReq + - A callback function can be pass to PullReq to be called when the request complete + - We place weight update in the callback to perform update when we get the gradient sum +* Computing stream + - Due to GPU's programming model, we need to do computation on non-default stream + - Use set_stream in mshadow tensors to set stream to computation stream + - To report error when you did not use stream, you can compile with -DMSHADOW_FORCE_STREAM + +We should note thate because the example runs on MNIST, which is an quite small dataset, you may not observe +speedup with multiple cards. However, you will find significant speedup when you run on other tasks. +The newest version of [cxxnet](https://github.com/antinucleon/cxxnet) + +### Moving Parameter Update to the Server +In all the examples so far, we use mshadow-ps to get the aggregated sum of gradients, and update +weights locally on each GPU. For more advanced usage of mshadow-ps, we can move the weight update +to the server. 
The communication pattern is as follows +* Each thread still call Push to push out gradient +* The server will apply the update rule to update the weight +* Each thread call PullReq to pull back the weight from server + +Such update pattern is suitable under distributed setting. To do so, user need to implement an +[IModelUpdater](../../mshadow-ps/ps.h#L202) interface. And define the following CreateModelUpdater function +in the program +```c++ +namespace mshadow { +namespace ps { +template<> +IModelUpdater *CreateModelUpdater() { + return new MyModelUpdater(); +} +} +} +``` +Before calling ISharedModel.Init, user need to call ```ps->SetParam("update_on_server", "1")``` to set the update +mode on the server side. If user uses distributed shared model, user must define ModelUpdater. diff --git a/guide/mshadow-ps/config.mk b/guide/mshadow-ps/config.mk new file mode 100644 index 000000000000..834b430c0f8c --- /dev/null +++ b/guide/mshadow-ps/config.mk @@ -0,0 +1,35 @@ +#--------------------------------------------------------------------------------------- +# mshadow: the configuration compile script +# +# This is configuration script that you can use to compile mshadow +# Usage: +# +# include config.mk in your Makefile, or directly include the definition of variables +# include mshadow.mk after the variables are set +# +# Add MSHADOW_CFLAGS to the compile flags +# Add MSHADOW_LDFLAGS to the linker flags +# Add MSHADOW_NVCCFLAGS to the nvcc compile flags +#---------------------------------------------------------------------------------------- + +# whether use CUDA during compile +USE_CUDA = 1 + +# add the path to CUDA libary to link and compile flag +# if you have already add them to enviroment variable, leave it as NONE +USE_CUDA_PATH = NONE + +# +# choose the version of blas you want to use +# can be: mkl, blas, atlas, openblas, apple +USE_BLAS = atlas +# +# add path to intel library, you may need it +# for MKL, if you did not add the path to enviroment variable +# +USE_INTEL_PATH = NONE + +# whether compile with parameter server +USE_DIST_PS = 0 +PS_PATH = NONE +PS_THIRD_PATH = NONE diff --git a/guide/mshadow-ps/local_sum-inl.h b/guide/mshadow-ps/local_sum-inl.h new file mode 100644 index 000000000000..5120590a2768 --- /dev/null +++ b/guide/mshadow-ps/local_sum-inl.h @@ -0,0 +1,113 @@ +// This is an example demonstrating the usage of mshadow ps +#include +// use openmp to launch multiple threads +#include +#include +#include + +// simple util to print result +void Print_(mshadow::Tensor ts) { + for (mshadow::index_t i = 0; i < ts.size(0); ++i) { + for (mshadow::index_t j = 0; j < ts.size(1); ++j) { + printf("%g ", ts[i][j]); + } + printf("\n"); + } +} +template +inline void Print(mshadow::Tensor ts) { + mshadow::TensorContainer tmp; + tmp.Resize(ts.shape_); + mshadow::Copy(tmp, ts); + Print_(tmp); +} + +// this function is runed by specific thread +template +inline void RunWorkerThread(int devid, + mshadow::ps::ISharedModel *ps) { + // initialize tensor engine + mshadow::InitTensorEngine(devid); + mshadow::Stream *stream = mshadow::NewStream(); + // allocate tensor on xpu + mshadow::TensorContainer data(mshadow::Shape2(2, 3)); + // set the computation stream to the new allocated stream + // this will make subsequent computation whose target is data + // to use the stream, stream is needed for async execution in GPU + data.set_stream(stream); + // assume these operations sets the content of dataient + data[0] = 1.0f; + data[1] = devid + data[0]; + printf("dev%d: before sync, data:\n", 
devid); + // use print to show result, do not call + // print normally since Copy will block + Print(data); + printf("====================\n"); + // intiaialize the key, register the shape on parameter server + ps->InitKey(data[0].shape_, 0, devid); + ps->InitKey(data[1].shape_, 1, devid); + // push data[0] out, for update, or aggregation + // 0 is the key of the data, devid is the current device id + ps->Push(data[0], 0, devid); + // pull request is used to request the data to be copied back + // once computation is done + ps->PullReq(data[0], 0, devid); + // computation can be done here.. + // the pull request handler will be overlapped with + // similar as previous call + ps->Push(data[1], 1, devid); + ps->PullReq(data[1], 1, devid); + // more computation can be done here... + // the computation will be overlapped + // PullWait will block until these request finishes + ps->PullWait(0, devid); + ps->PullWait(1, devid); + printf("dev%d: after sync, data:\n", devid); + // use print to show result, do not call + // print normally since Copy will block + Print(data); + printf("====================\n"); + mshadow::DeleteStream(stream); + mshadow::ShutdownTensorEngine(); +} + +namespace mshadow { +namespace ps { +// model updater is used when update is happening on server side +// if we only use parameter server for sum aggregation +// this is not needed, but we must declare this function to return NULL +template<> +IModelUpdater *CreateModelUpdater(void) { + return NULL; +} +} +} + +template +inline int Run(int argc, char *argv[]) { + if (argc < 2) { + printf("Usage: device list\n"\ + "\tfor CPU the device list can be arbitrary\n"\ + "\tfor GPU the device list need to be actual device index\n"); + return 0; + } + // list of device ids + std::vector devs; + // initialization + for (int i = 1; i < argc; ++i) { + // record the device id + devs.push_back(atoi(argv[i])); + } + mshadow::ps::ISharedModel + *ps = mshadow::ps::CreateSharedModel("local"); + // intiaialize the ps + ps->Init(devs); + // use openmp to launch #devs threads + #pragma omp parallel num_threads(devs.size()) + { + int tid = omp_get_thread_num(); + RunWorkerThread(devs[tid], ps); + } + delete ps; + return 0; +} diff --git a/guide/mshadow-ps/local_sum.cpp b/guide/mshadow-ps/local_sum.cpp new file mode 100644 index 000000000000..7f0eed0df42e --- /dev/null +++ b/guide/mshadow-ps/local_sum.cpp @@ -0,0 +1,4 @@ +#include "./local_sum-inl.h" +int main(int argc, char *argv[]) { + return Run(argc, argv); +} diff --git a/guide/mshadow-ps/local_sum.cu b/guide/mshadow-ps/local_sum.cu new file mode 100644 index 000000000000..6e839601a265 --- /dev/null +++ b/guide/mshadow-ps/local_sum.cu @@ -0,0 +1,4 @@ +#include "./local_sum-inl.h" +int main(int argc, char *argv[]) { + return Run(argc, argv); +} diff --git a/example/neuralnet/Makefile b/guide/neuralnet/Makefile similarity index 54% rename from example/neuralnet/Makefile rename to guide/neuralnet/Makefile index 7cb45e4afa2d..826384b5f3b0 100644 --- a/example/neuralnet/Makefile +++ b/guide/neuralnet/Makefile @@ -2,31 +2,27 @@ export CC = gcc export CXX = g++ export NVCC =nvcc -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../../ - - -ifeq ($(blas),1) - LDFLAGS= -lcblas -lm -lcudart -lcublas -lcurand - CFLAGS+= -DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CBLAS=1 -else - LDFLAGS= -lm -lcudart -lcublas -lcurand -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lpthread -endif -export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) +include config.mk +include 
../../make/mshadow.mk +export CFLAGS = -Wall -O3 -I../../ -fopenmp $(MSHADOW_CFLAGS) +export LDFLAGS= -lm $(MSHADOW_LDFLAGS) +export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) # specify tensor path BIN = OBJ = CUOBJ = -CUBIN = nnet convnet +CUBIN = nnet convnet nnet_ps .PHONY: clean all all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ) nnet: nnet.cu +nnet_ps: nnet_ps.cu convnet: convnet.cu $(BIN) : - $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) + $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) $(OBJ) : $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) @@ -39,3 +35,4 @@ $(CUBIN) : clean: $(RM) $(OBJ) $(BIN) $(CUBIN) $(CUOBJ) *~ + diff --git a/guide/neuralnet/README.md b/guide/neuralnet/README.md new file mode 100644 index 000000000000..dd181e758c65 --- /dev/null +++ b/guide/neuralnet/README.md @@ -0,0 +1,16 @@ +Example Neural Net code with MShadow +==== + +To compile the code, modify ```config.mk``` to the setting you like and type make +* You will need to have CUDA and a version of BLAS + +To run the demo, download MNIST dataset from: http://yann.lecun.com/exdb/mnist/ +unzip all the files into current folder + +and run by ./nnet cpu or ./nnet gpu. ./convnet cpu or ./convnet gpu + +MultiGPU Version +==== +* If you have two GPUs, you can run it by ```./nnet_ps gpu 0 1```. +* You can run it using CPUs ```./nnet_ps cpu 0 1```. +* This is an demonstration of mshadow-ps interface, see introduction in [../mshadow-ps](../mshadow-ps) diff --git a/guide/neuralnet/config.mk b/guide/neuralnet/config.mk new file mode 100644 index 000000000000..112396d5557b --- /dev/null +++ b/guide/neuralnet/config.mk @@ -0,0 +1,35 @@ +#--------------------------------------------------------------------------------------- +# mshadow: the configuration compile script +# +# This is configuration script that you can use to compile mshadow +# Usage: +# +# include config.mk in your Makefile, or directly include the definition of variables +# include mshadow.mk after the variables are set +# +# Add MSHADOW_CFLAGS to the compile flags +# Add MSHADOW_LDFLAGS to the linker flags +# Add MSHADOW_NVCCFLAGS to the nvcc compile flags +#---------------------------------------------------------------------------------------- + +# whether use CUDA during compile +USE_CUDA = 1 + +# add the path to CUDA libary to link and compile flag +# if you have already add them to enviroment variable, leave it as NONE +USE_CUDA_PATH = NONE + +# +# choose the version of blas you want to use +# can be: mkl, blas, atlas, openblas, apple +USE_BLAS = mkl +# +# add path to intel library, you may need it +# for MKL, if you did not add the path to enviroment variable +# +USE_INTEL_PATH = NONE + +# whether compile with parameter server +USE_DIST_PS = 0 +PS_PATH = NONE +PS_THIRD_PATH = NONE diff --git a/guide/neuralnet/convnet.cu b/guide/neuralnet/convnet.cu new file mode 100644 index 000000000000..97b6a03fc416 --- /dev/null +++ b/guide/neuralnet/convnet.cu @@ -0,0 +1,267 @@ +// this implements a simple convolution neural net: conv-maxpool-fullc +#include +// header file to use mshadow +#include "mshadow/tensor.h" +// helper function to load mnist dataset +#include "util.h" +// this namespace contains all data structures, functions +using namespace mshadow; +// this namespace contains all operator overloads +using namespace mshadow::expr; + +// define operations +struct relu{ + MSHADOW_XINLINE static real_t Map(real_t a) { + using namespace std; + return max(a, 0.0f); + } +}; +struct relu_grad { + 
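+  // derivative of relu with respect to its input: 1 for positive inputs, 0 otherwise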
MSHADOW_XINLINE static real_t Map(real_t a) { + return a > 0.0f ? 1.0f : 0.0f; + } +}; + +/*! \brief interface for nnet, interfacd allows use to use GPU/CPU implementation in a unified way */ +class INNet{ + public: + virtual void Forward(const Tensor& inbatch, Tensor &oubatch) = 0; + virtual void Backprop(const Tensor& gradout) = 0; + virtual void Update(void) = 0; + virtual ~INNet() {} +}; + +/*! + * \brief simple two layer conv-net conv-pool-flat-fullc + * this implementation is device invariant + */ +template +class ConvNet : public INNet { + public: + // initialize the network + ConvNet(int batch_size, int insize, int nchannel, int ksize, int kstride, int psize, int num_out) + :rnd(0), ksize(ksize), kstride(kstride), psize(psize) { + // setup nodes + ninput.Resize(Shape4(batch_size, 1, insize, insize)); + nhidden.Resize(Shape4(batch_size, nchannel, (insize - ksize)/kstride+1, (insize -ksize)/kstride+1)); + nhiddenbak.Resize(nhidden.shape_); + npool.Resize(Shape4(batch_size, nchannel, (nhidden.size(2)+1-psize)/psize, (nhidden.size(3)+1-psize)/psize)); + npoolbak.Resize(npool.shape_); + nflat.Resize(Shape2(batch_size, npool.size(1)*npool.size(2)*npool.size(3))); + nout.Resize(Shape2(batch_size, num_out)); + // setup bias + hbias.Resize(Shape1(nchannel)); g_hbias.Resize(hbias.shape_); + obias.Resize(Shape1(num_out)); g_obias.Resize(obias.shape_); + hbias = 0.0f; obias = 0.0f; + // setup weights + Ki2h.Resize(Shape2(nchannel, ksize*ksize)); g_Ki2h.Resize(Ki2h.shape_); + Wh2o.Resize(Shape2(nflat.size(1), num_out)); g_Wh2o.Resize(Wh2o.shape_); + rnd.SampleGaussian(&Ki2h, 0, 0.01f); + rnd.SampleGaussian(&Wh2o, 0, 0.01f); + + printf("conv=%d, pool=%d\n", nhidden.size(3), npool.size(3)); + } + virtual ~ConvNet() {} + // forward propagation + virtual void Forward(const Tensor& inbatch, Tensor &oubatch) { + index_t batch_size = inbatch.size(0); + // copy data to input layer + Copy(ninput, inbatch); + // first layer, conv, use stride=2 + ConvForward(ninput, Ki2h, nhidden, ksize, kstride, tmp_col, tmp_dst); + // add bias + nhidden += broadcast<1>(hbias, nhidden.shape_); + // activation, relu, backup activation in nhidden + nhidden = F(nhidden); + Copy(nhiddenbak, nhidden); + // max pooling + npool = pool(nhiddenbak, npool[0][0].shape_, psize, psize, psize); + Copy(npoolbak, npool); + // flat + nflat = reshape(npool, nflat.shape_); + // second layer fullc + nout = dot(nflat, Wh2o); + nout += repmat(obias, batch_size); + // softmax calculation + Softmax(nout, nout); + // copy result out + Copy(oubatch, nout); + } + // back propagation + virtual void Backprop(const Tensor& gradout) { + // copy gradient to output layer + Copy(nout, gradout); + // calc grad of final layer + g_obias = sum_rows(nout); + g_Wh2o = dot(nflat.T(), nout); + // backprop to previous layer + nflat = dot(nout, Wh2o.T()); + npool = reshape(nflat, npool.shape_); + // backprop pooling layer + nhiddenbak = unpool(nhiddenbak, npoolbak, npool, psize, psize, psize); + // calculate gradient of relu layer + nhidden = F(nhidden) * nhiddenbak; + // calc grad of layer 1 + g_hbias = sumall_except_dim<1>(nhidden); + ConvBackWard(nhidden, Ki2h, g_Ki2h, ninput, ksize, kstride, tmp_col, tmp_dst); + } + // update weight + virtual void Update(void) { + // run SGD + const float eta = 0.1; + const float wd = 0.00001; + // update weight + Ki2h -= eta * (wd * Ki2h + g_Ki2h); + Wh2o -= eta * (wd * Wh2o + g_Wh2o); + // no regularization for bias + hbias-= eta * g_hbias; + obias-= eta * g_obias; + } + private: + // forward convolution, tmp_col and tmp_dst 
are helper structure + inline static void ConvForward(const Tensor &in, + const Tensor &kernel, + Tensor &out, + int ksize, int kstride, + TensorContainer &tmp_col, + TensorContainer &tmp_dst) { + index_t oheight = (in.size(2) - ksize)/kstride + 1; + index_t owidth = (in.size(3) - ksize)/kstride + 1; + index_t nbatch = in.size(0); + index_t nchannel = out.size(1); + // we directly unpack all local patches and do a dot product + // this cost lots of memory, normally for large image, only unpack several image at a time + tmp_col.Resize(Shape2(in.size(1)*ksize*ksize, nbatch*oheight*owidth)); + tmp_dst.Resize(Shape2(nchannel, nbatch*oheight*owidth)); + // unpack local patches , stride=1 + tmp_col = unpack_patch2col(in, ksize, ksize, kstride); + tmp_dst = dot(kernel, tmp_col); + // reshape, then swap axis, we chain equations together + out = swapaxis<1,0>(reshape(tmp_dst, Shape4(nchannel, nbatch, oheight, owidth))); + } + // backward convolution, calculate gradient of kernel, and backprop back to in + inline static void ConvBackWard(const Tensor &out, + const Tensor &kernel, + Tensor &g_kernel, + Tensor &in, + int ksize, int kstride, + TensorContainer &tmp_col, + TensorContainer &tmp_dst) { + index_t oheight = (in.size(2) - ksize)/kstride + 1; + index_t owidth = (in.size(3) - ksize)/kstride + 1; + index_t nbatch = in.size(0); + index_t nchannel = out.size(1); + // we directly unpack all local patches and do a dot product + // this cost lots of memory, normally for large image, only unpack several image at a time + tmp_col.Resize(Shape2(in.size(1) * ksize * ksize, + nbatch * oheight * owidth)); + tmp_dst.Resize(Shape2(nchannel, nbatch * oheight * owidth)); + // unpack local patches + tmp_col = unpack_patch2col(in, ksize, ksize, kstride); + tmp_dst = reshape(swapaxis<1,0>(out), tmp_dst.shape_); + g_kernel = dot(tmp_dst, tmp_col.T()); + // backpropgation: not necessary for first layer, but included anyway + tmp_col = dot(kernel.T(), tmp_dst); + in = pack_col2patch(tmp_col, in.shape_, ksize, ksize, kstride); + } + private: + // random seed generator + Random rnd; + // kernel size, pooling size + int ksize, kstride, psize; + // nodes in neural net + TensorContainer ninput, nhidden, nhiddenbak, npool, npoolbak; + TensorContainer nflat, nout; + // temp helper structure + TensorContainer tmp_col, tmp_dst; + // hidden bias, gradient + TensorContainer hbias, obias, g_hbias, g_obias; + // weight, gradient: Ki2h is actually convoltuion kernel, with shape=(num_channel,ksize*ksize) + TensorContainer Ki2h, Wh2o, g_Ki2h, g_Wh2o; +}; + +// helper function to get the max inde +inline int MaxIndex(Tensor pred) { + int maxidx = 0; + for (index_t i = 1; i < pred.size(0); ++i) { + if(pred[i] > pred[maxidx]) maxidx = (int)i; + } + return maxidx; +} + +int main(int argc, char *argv[]) { + if(argc < 2) { + printf("Usage: cpu or gpu\n"); return 0; + } + srand(0); + // settings + int batch_size = 100; + int insize = 28; + int nchannel = 10; + int ksize = 5; + int kstride = 1; + int psize = 2; + int num_out = 10; + + // choose which version to use + INNet *net; + if (!strcmp(argv[1], "gpu")) { + InitTensorEngine(); + net = new ConvNet(batch_size, insize, nchannel, ksize, kstride, psize, num_out); + } else { + InitTensorEngine(); + net = new ConvNet(batch_size, insize, nchannel, ksize, kstride, psize, num_out); + } + + // temp output layer + TensorContainer pred; + pred.Resize(Shape2(batch_size, num_out)); + + // label + std::vector ytrain, ytest; + // data + TensorContainer xtrain_, xtest_; + 
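+  // LoadMNIST (from util.h) fills the label vector and a 2-D image tensor, one row of 28*28 pixels per image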
LoadMNIST("train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain_, true); + LoadMNIST("t10k-images-idx3-ubyte", "t10k-labels-idx1-ubyte", ytest, xtest_, false); + + TensorContainer xtrain(Shape4(xtrain_.size(0), 1, insize, insize)); + TensorContainer xtest(Shape4(xtest_.size(0), 1, insize, insize)); + xtrain = reshape(xtrain_, xtrain.shape_); + xtest = reshape(xtest_, xtest.shape_); + + int num_iter = 20; + + for (int i = 0; i < num_iter; ++ i) { + // training + for (index_t j = 0; j + batch_size <= xtrain.size(0); j += batch_size) { + net->Forward(xtrain.Slice(j, j + batch_size), pred); + // set gradient into pred + for (int k = 0; k < batch_size; ++ k) { + pred[k][ ytrain[k+j] ] -= 1.0f; + } + // scale gradient by batchs zie + pred *= 1.0f / batch_size; + // run backprop + net->Backprop(pred); + // update net parameters + net->Update(); + } + // evaluation + long nerr = 0; + for (index_t j = 0; j + batch_size <= xtest.size(0); j += batch_size) { + net->Forward(xtest.Slice(j, j + batch_size), pred); + for (int k = 0; k < batch_size; ++ k) { + nerr += MaxIndex(pred[k]) != ytest[j+k]; + } + } + printf("round %d: test-err=%f\n", i, (float)nerr/xtest.size(0)); + } + delete net; + + if (!strcmp(argv[1], "gpu")) { + ShutdownTensorEngine(); + } else { + ShutdownTensorEngine(); + } + return 0; +} diff --git a/guide/neuralnet/nnet.cu b/guide/neuralnet/nnet.cu new file mode 100644 index 000000000000..8e79cf608f3c --- /dev/null +++ b/guide/neuralnet/nnet.cu @@ -0,0 +1,188 @@ +// this implements a simple two layer neural net +#include +#include +// header file to use mshadow +#include "mshadow/tensor.h" +// helper function to load mnist dataset +#include "util.h" +// this namespace contains all data structures, functions +using namespace mshadow; +// this namespace contains all operator overloads +using namespace mshadow::expr; + +// define sigmoid operation +struct sigmoid{ + MSHADOW_XINLINE static real_t Map(real_t a) { + return 1.0f/(1.0f+expf(-a)); + } +}; + +/*! \brief interface for nnet, interfacd allows use to use GPU/CPU implementation in a unified way */ +class INNet{ + public: + virtual void Forward(const Tensor& inbatch, Tensor &oubatch) = 0; + virtual void Backprop(const Tensor& gradout) = 0; + virtual void Update(void) = 0; + virtual ~INNet() {} +}; + +/*! 
+ * \brief simple two layer neural net + * this implementation is device invariant + */ +template +class NNet : public INNet { + public: + // initialize the network + NNet(int batch_size, int num_in, int num_hidden, int num_out) : rnd(0) { + // setup nodes + ninput.Resize(Shape2(batch_size, num_in)); + nhidden.Resize(Shape2(batch_size, num_hidden)); + nhiddenbak.Resize(nhidden.shape_); + nout.Resize(Shape2(batch_size, num_out)); + // setup bias + hbias.Resize(Shape1(num_hidden)); g_hbias.Resize(hbias.shape_); + obias.Resize(Shape1(num_out)); g_obias.Resize(obias.shape_); + hbias = 0.0f; obias = 0.0f; + // setup weights + Wi2h.Resize(Shape2(num_in, num_hidden)); g_Wi2h.Resize(Wi2h.shape_); + Wh2o.Resize(Shape2(num_hidden, num_out)); g_Wh2o.Resize(Wh2o.shape_); + rnd.SampleGaussian(&Wi2h, 0, 0.01f); + rnd.SampleGaussian(&Wh2o, 0, 0.01f); + } + virtual ~NNet() {} + // forward propagation + virtual void Forward(const Tensor& inbatch, + Tensor &oubatch) { + // size is same conventsion as numpy + index_t batch_size = inbatch.size(0); + // copy data to input layer + Copy(ninput, inbatch); + // first layer, fullc + nhidden = dot(ninput, Wi2h); + nhidden+= repmat(hbias, batch_size); + // activation, sigmloid, backup activation in nhidden + nhidden = F(nhidden); + Copy(nhiddenbak, nhidden); + // second layer fullc + nout = dot(nhiddenbak, Wh2o); + nout += repmat(obias, batch_size); + // softmax calculation + Softmax(nout, nout); + // copy result out + Copy(oubatch, nout); + } + // back propagation + virtual void Backprop(const Tensor& gradout) { + // copy gradient to output layer + Copy(nout, gradout); + // calc grad of layer 2 + g_obias = sum_rows(nout); + g_Wh2o = dot(nhiddenbak.T(), nout); + // backprop to layer 1 + nhiddenbak = dot(nout, Wh2o.T()); + // calculate gradient of sigmoid layer + nhidden = nhidden * (1.0f-nhidden) * nhiddenbak; + // calc grad of layer 1 + g_hbias = sum_rows(nhidden); + g_Wi2h = dot(ninput.T(), nhidden); + } + // update weight + virtual void Update(void) { + // run SGD + const float eta = 0.8; + const float wd = 0.00001; + // update weight + Wi2h -= eta * (wd * Wi2h + g_Wi2h); + Wh2o -= eta * (wd * Wh2o + g_Wh2o); + // no regularization for bias + hbias-= eta * g_hbias; + obias-= eta * g_obias; + } + private: + // random seed generator + Random rnd; + // nodes in neural net + TensorContainer ninput, nhidden, nhiddenbak, nout; + // hidden bias, gradient + TensorContainer hbias, obias, g_hbias, g_obias; + // weight gradient + TensorContainer Wi2h, Wh2o, g_Wi2h, g_Wh2o; +}; +// helper function to get the max inde +inline int MaxIndex(Tensor pred) { + int maxidx = 0; + for(index_t i = 1; i < pred.size(0); ++i) { + if(pred[i] > pred[maxidx]) maxidx = (int)i; + } + return maxidx; +} + +int main(int argc, char *argv[]) { + if(argc < 2) { + printf("Usage: cpu or gpu\n"); return 0; + } + srand(0); + + // settings + int batch_size = 100; + int num_in = 28 * 28; + int num_hidden = 100; + int num_out = 10; + // choose which version to use + INNet *net; + if (!strcmp(argv[1], "gpu")) { + InitTensorEngine(); + net = new NNet(batch_size, num_in, num_hidden, num_out); + } else { + InitTensorEngine(); + net = new NNet(batch_size, num_in, num_hidden, num_out); + } + + // temp output layer + TensorContainer pred; + pred.Resize(Shape2(batch_size, num_out)); + + // label + std::vector ytrain, ytest; + // data + TensorContainer xtrain, xtest; + LoadMNIST("train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain, true); + LoadMNIST("t10k-images-idx3-ubyte", 
"t10k-labels-idx1-ubyte", ytest, xtest, false); + + int num_iter = 20; + + for (int i = 0; i < num_iter; ++ i) { + // training + for (index_t j = 0; j + batch_size <= xtrain.size(0); j += batch_size) { + net->Forward(xtrain.Slice(j, j + batch_size), pred); + // set gradient into pred + for (int k = 0; k < batch_size; ++ k) { + pred[k][ ytrain[k+j] ] -= 1.0f; + } + // scale gradient by batchs zie + pred *= 1.0f / batch_size; + // run backprop + net->Backprop(pred); + // update net parameters + net->Update(); + } + // evaluation + long nerr = 0; + for (index_t j = 0; j + batch_size <= xtest.size(0); j += batch_size) { + net->Forward(xtest.Slice(j, j + batch_size), pred); + for (int k = 0; k < batch_size; ++ k) { + nerr += MaxIndex(pred[k]) != ytest[j+k]; + + } + } + printf("round %d: test-err=%f\n", i, (float)nerr/xtest.size(0)); + } + delete net; + if (!strcmp(argv[1], "gpu")) { + ShutdownTensorEngine(); + } else { + ShutdownTensorEngine(); + } + return 0; +} diff --git a/guide/neuralnet/nnet_ps.cu b/guide/neuralnet/nnet_ps.cu new file mode 100644 index 000000000000..996bbe266d7b --- /dev/null +++ b/guide/neuralnet/nnet_ps.cu @@ -0,0 +1,312 @@ +// this implements a simple two layer Multi-GPU neural net +// this implementation uses mshadow-ps to get gradient aggregation +// between cards +// this code is modified from nnet.cu +#include +#include +#include +// header file to use mshadow +#include +#include +// helper function to load mnist dataset +#include "./util.h" +// this namespace contains all data structures, functions +using namespace mshadow; +// this namespace contains all operator overloads +using namespace mshadow::expr; + +// define sigmoid operation +struct sigmoid { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 1.0f / (1.0f + expf(-a)); + } +}; + +/*! \brief interface for nnet, interfacd allows use to use GPU/CPU implementation in a unified way */ +class INNet{ + public: + virtual void Forward(const Tensor& inbatch, + Tensor &oubatch) = 0; + virtual void Backprop(const Tensor& gradout) = 0; + virtual ~INNet() {} +}; + +/*! 
+ * \brief simple two layer neural net + * this implementation is device invariant + */ +template +class NNet : public INNet { + public: + // initialize the network + NNet(int batch_size, int num_in, int num_hidden, int num_out, + int devid, mshadow::ps::ISharedModel *ps) + : rnd(0), devid(devid), ps(ps) { + mshadow::SetDevice(devid); + stream = mshadow::NewStream(); + // set the computing streams + ninput.set_stream(stream); + nhidden.set_stream(stream); + nhiddenbak.set_stream(stream); + nout.set_stream(stream); + hbias.set_stream(stream); + obias.set_stream(stream); + g_hbias.set_stream(stream); + g_obias.set_stream(stream); + Wi2h.set_stream(stream); + Wh2o.set_stream(stream); + g_Wi2h.set_stream(stream); + g_Wh2o.set_stream(stream); + rnd.set_stream(stream); + // setup nodes + ninput.Resize(Shape2(batch_size, num_in)); + nhidden.Resize(Shape2(batch_size, num_hidden)); + nhiddenbak.Resize(nhidden.shape_); + nout.Resize(Shape2(batch_size, num_out)); + // setup bias + hbias.Resize(Shape1(num_hidden)); g_hbias.Resize(hbias.shape_); + obias.Resize(Shape1(num_out)); g_obias.Resize(obias.shape_); + hbias = 0.0f; obias = 0.0f; + // setup weights + Wi2h.Resize(Shape2(num_in, num_hidden)); g_Wi2h.Resize(Wi2h.shape_); + Wh2o.Resize(Shape2(num_hidden, num_out)); g_Wh2o.Resize(Wh2o.shape_); + rnd.SampleGaussian(&Wi2h, 0, 0.01f); + rnd.SampleGaussian(&Wh2o, 0, 0.01f); + // initialize the key + ps->InitKey(Wi2h.shape_, 0, devid); + ps->InitKey(hbias.shape_, 1, devid); + ps->InitKey(Wh2o.shape_, 2, devid); + ps->InitKey(obias.shape_, 3, devid); + } + virtual ~NNet() { + mshadow::SetDevice(devid); + mshadow::DeleteStream(stream); + } + // forward propagation + virtual void Forward(const Tensor &inbatch, + Tensor &oubatch) { + // size is same conventsion as numpy + index_t batch_size = inbatch.size(0); + // copy data to input layer + Copy(ninput, inbatch, stream); + // wait the last pull requst on layer to complete + ps->PullWait(0, devid); + // first layer, fullc + nhidden = dot(ninput, Wi2h); + // wait the pull request on hbias to complete + ps->PullWait(1, devid); + nhidden+= repmat(hbias, batch_size); + // activation, sigmloid, backup activation in nhidden + nhidden = F(nhidden); + Copy(nhiddenbak, nhidden, stream); + // second layer fullc + ps->PullWait(2, devid); + nout = dot(nhiddenbak, Wh2o); + ps->PullWait(3, devid); + nout += repmat(obias, batch_size); + // softmax calculation + Softmax(nout, nout); + // copy result out + Copy(oubatch, nout, stream); + // Copy with stream is non-blocking, use wait to wait until copy finishes + stream->Wait(); + } + // back propagation + virtual void Backprop(const Tensor &gradout) { + // copy gradient to output layer + Copy(nout, gradout, stream); + // calc grad of layer 2 + g_obias = sum_rows(nout); + // sync proc defines the synchronization step + this->SyncProc(obias, g_obias, 3); + // update second layer weights + g_Wh2o = dot(nhiddenbak.T(), nout); + // backprop to layer 1 + nhiddenbak = dot(nout, Wh2o.T()); + this->SyncProc(Wh2o, g_Wh2o, 2); + // calculate gradient of sigmoid layer + nhidden = nhidden * (1.0f-nhidden) * nhiddenbak; + // calc grad of layer 1 + g_hbias = sum_rows(nhidden); + this->SyncProc(hbias, g_hbias, 1); + g_Wi2h = dot(ninput.T(), nhidden); + this->SyncProc(Wi2h, g_Wi2h, 0); + } + // synchronization function + template + inline void SyncProc(mshadow::Tensor weight, + mshadow::Tensor grad, + int data_key) { + // wait till last computation finishes + stream->Wait(); + ps->Push(grad, data_key, devid, -data_key); + ps->PullReq(grad, 
data_key, devid, -data_key, + UpdateEntry::ApplyUpdate, + new UpdateEntry(weight.FlatTo2D(), grad.FlatTo2D(), dim == 1)); + } + // data structure defined to help using callback function + struct UpdateEntry { + mshadow::Tensor weight; + mshadow::Tensor grad; + bool is_bias; + // constructor + UpdateEntry(mshadow::Tensor weight, + mshadow::Tensor grad, + bool is_bias) + : weight(weight), grad(grad), + is_bias(is_bias) {} + inline void Update(mshadow::Stream *stream) { + weight.set_stream(stream); + const float wd = 0.00001; + const float eta = 0.8; + if (!is_bias) { + weight -= eta * (wd * weight + grad); + } else { + weight -= eta * grad; + } + } + // callback function to apply update + inline static void ApplyUpdate(mshadow::Stream *stream, void *arg) { + UpdateEntry *e = static_cast(arg); + e->Update(stream); + delete e; + } + }; + + private: + // computing stream + mshadow::Stream *stream; + // device id + int devid; + // parameter server interface + mshadow::ps::ISharedModel *ps; + // random seed generator + Random rnd; + // nodes in neural net + TensorContainer ninput, nhidden, nhiddenbak, nout; + // hidden bias, gradient + TensorContainer hbias, obias, g_hbias, g_obias; + // weight gradient + TensorContainer Wi2h, Wh2o, g_Wi2h, g_Wh2o; +}; + +// helper function to get the max inde +inline int MaxIndex(Tensor pred) { + int maxidx = 0; + for(index_t i = 1; i < pred.size(0); ++i) { + if(pred[i] > pred[maxidx]) maxidx = (int)i; + } + return maxidx; +} + +namespace mshadow { +namespace ps { +// model updater is used when update is happening on server side +// if we only use parameter server for sum aggregation +// this is not needed, but we must declare this function to return NULL +template<> +IModelUpdater *CreateModelUpdater(void) { + return NULL; +} +} +} + +template +inline int Run(int argc, char *argv[]) { + srand(0); + // settings + int batch_size = 100; + int num_in = 28 * 28; + int num_hidden = 100; + int num_out = 10; + int ndev = argc - 2; + if (batch_size % ndev != 0) { + fprintf(stderr, "choose number of devices ndev such that 100 MOD ndev == 0\n"); + return 0; + } + // choose which version to use + std::vector devs; + for (int i = 2; i < argc; ++i) { + devs.push_back(atoi(argv[i])); + } + mshadow::ps::ISharedModel + *ps = mshadow::ps::CreateSharedModel("local"); + ps->Init(devs); + + std::vector nets(ndev); + for (int i = 0; i < ndev; ++i) { + mshadow::InitTensorEngine(devs[i]); + nets[i] = new NNet(batch_size / ndev, num_in, num_hidden, num_out, devs[i], ps); + } + + // label + std::vector ytrain, ytest; + // data + TensorContainer xtrain, xtest; + LoadMNIST("train-images-idx3-ubyte", "train-labels-idx1-ubyte", ytrain, xtrain, true); + LoadMNIST("t10k-images-idx3-ubyte", "t10k-labels-idx1-ubyte", ytest, xtest, false); + int num_iter = 20; + + for (int i = 0; i < num_iter; ++ i) { + // mini-batch per device + int step = batch_size / ndev; + // running parallel threads + #pragma omp parallel num_threads(ndev) + { + // temp output layer + TensorContainer pred; + pred.Resize(Shape2(step, num_out)); + int tid = omp_get_thread_num(); + mshadow::SetDevice(devs[tid]); + for (index_t j = 0; j + batch_size <= xtrain.size(0); j += batch_size) { + nets[tid]->Forward(xtrain.Slice(j + tid * step, j + (tid + 1) * step), pred); + // set gradient into pred + for (int k = 0; k < step; ++ k) { + pred[k][ytrain[j + tid * step + k]] -= 1.0f; + } + // scale gradient by batchs zie + pred *= 1.0f / batch_size; + // run backprop + nets[tid]->Backprop(pred); + } + } + // evaluation + long nerr = 0; 
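+    // evaluation mirrors training: each thread scores its own slice of the
+    // test batch on its own device, and the per-thread error counts are
+    // combined through the OpenMP reduction(+:nerr) clause below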
+ #pragma omp parallel num_threads(ndev) reduction(+:nerr) + { + // temp output layer + TensorContainer pred; + pred.Resize(Shape2(step, num_out)); + int tid = omp_get_thread_num(); + mshadow::SetDevice(devs[tid]); + for (index_t j = 0; j + batch_size <= xtest.size(0); j += batch_size) { + nets[tid]->Forward(xtest.Slice(j + tid * step, j + (tid + 1) * step), pred); + for (int k = 0; k < step; ++ k) { + nerr += MaxIndex(pred[k]) != ytest[j + tid * step + k]; + } + } + } + printf("round %d: test-err=%f\n", i, (float)nerr/xtest.size(0)); + } + + for(int i = 0; i < ndev; ++i) { + mshadow::SetDevice(devs[i]); + delete nets[i]; + ShutdownTensorEngine(); + } + return 0; +} +int main(int argc, char *argv[]) { + if (argc < 3) { + printf("Usage: devicelist\n"\ + "\tExample1: ./nnet_ps cpu 1 2 3\n"\ + "\tExample2: ./nnet_ps gpu 0 1\n"); + return 0; + } + if (!strcmp(argv[1], "cpu")) { + Run(argc, argv); + } else { + Run(argc, argv); + } + return 0; +} diff --git a/guide/neuralnet/util.h b/guide/neuralnet/util.h new file mode 100644 index 000000000000..f58203c7667a --- /dev/null +++ b/guide/neuralnet/util.h @@ -0,0 +1,86 @@ +#pragma once +#include +#include +#include +#include "mshadow/tensor.h" + +typedef float real_t; + +using namespace mshadow; + +int pack(unsigned char zz[4]){ + return (int)(zz[3]) + | (((int)(zz[2])) << 8) + | (((int)(zz[1])) << 16) + | (((int)(zz[0])) << 24); +} + +template +inline void shuffle(T *data, size_t sz){ + if(sz == 0) return; + for(size_t i = sz - 1; i > 0; i--){ + std::swap(data[i], data[rand() % (i+1)]); + } +} +// random shuffle the data inside, require PRNG +template +inline void shuffle(std::vector &data){ + shuffle(&data[0], data.size()); +} + +// simple function to load in mnist +inline void LoadMNIST(const char *path_img, const char *path_label, + std::vector &ylabel, + TensorContainer &xdata, + bool do_shuffle){ + // load in data + FILE *fi = fopen(path_img, "rb"); + if (fi == NULL) { + printf("cannot open %s\n", path_img); + exit(-1); + } + unsigned char zz[4]; + unsigned char *t_data, *l_data; + int num_image, width, height, nlabel; + assert(fread(zz, 4 , 1, fi)); + assert(fread(zz, 4 , 1, fi)); + num_image = pack(zz); + assert(fread(zz, 4 , 1, fi)); + width = pack(zz); + assert(fread(zz, 4 , 1, fi)); + height = pack(zz); + + int step = width * height; + t_data = new unsigned char[num_image * step]; + assert(fread(t_data, step*num_image , 1 , fi)); + fclose(fi); + + // load in label + fi = fopen(path_label, "rb"); + assert(fread(zz, 4 , 1, fi)); + assert(fread(zz, 4 , 1, fi)); + nlabel = pack(zz); + assert(num_image == nlabel); + l_data = new unsigned char[num_image]; + assert(fread(l_data, num_image , 1 , fi)); + // try to do shuffle + std::vector rindex; + for (int i = 0; i < num_image; ++ i) { + rindex.push_back(i); + } + if (do_shuffle) { + shuffle(rindex); + } + + // save out result + ylabel.resize(num_image); + xdata.Resize(Shape2(num_image, width * height)); + for (int i = 0 ; i < num_image ; ++i) { + for(int j = 0; j < step; ++j) { + xdata[i][j] = (float)(t_data[rindex[i]*step + j]) / 256.0f; + } + ylabel[i] = l_data[rindex[i]]; + } + delete[] t_data; delete [] l_data; + printf("finish loading %dx%d matrix from %s, shuffle=%d\n", num_image, step, path_img, (int)do_shuffle); +} diff --git a/make/README.md b/make/README.md new file mode 100644 index 000000000000..6ef24d6d467c --- /dev/null +++ b/make/README.md @@ -0,0 +1,18 @@ +Makefile Configuration of MShadow +===== +MShadow is a template library, you only need to include mshadow to use it. 
So this folder is not used to build mshadow library file. + +However, mshadow is a flexible library that allows you to compile with different configurations. For example, +you can compile mshadow without CUDA, and specify your own choice of BLAS. +There are different compile flags that you might need to set in your own configuration. +This folder provides a Makefile script to help you do that. + +Usage +===== +* Set the configurations via variables in your Makefile, see example in [../guide/config.mk](../guide/config.mk) +* include [mshadow.mk](mshadow.mk) in your Makefile +* mshadow.mk will give you compiler variables that you can include when compiling + - Add MSHADOW_CFLAGS to the compile flags + - Add MSHADOW_LDFLAGS to the linker flags + - Add MSHADOW_NVCCFLAGS to the nvcc compile flags +* For example Makefile, see [../guide/Makefile](../guide/Makefile) diff --git a/make/mshadow.mk b/make/mshadow.mk new file mode 100644 index 000000000000..6e7b68b7b989 --- /dev/null +++ b/make/mshadow.mk @@ -0,0 +1,62 @@ +#--------------------------------------------------------------------------------------- +# mshadow configuration script +# +# include mshadow.mk after the variables are set +# +# Add MSHADOW_CFLAGS to the compile flags +# Add MSHADOW_LDFLAGS to the linker flags +# Add MSHADOW_NVCCFLAGS to the nvcc compile flags +#---------------------------------------------------------------------------------------- + +MSHADOW_CFLAGS = -msse3 -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas +MSHADOW_LDFLAGS = -lm +MSHADOW_NVCCFLAGS = + +ifeq ($(USE_CUDA), 0) + MSHADOW_CFLAGS += -DMSHADOW_USE_CUDA=0 +else + MSHADOW_LDFLAGS += -lcudart -lcublas -lcurand +endif +ifneq ($(USE_CUDA_PATH), NONE) + MSHADOW_CFLAGS += -I$(USE_CUDA_PATH)/include + MSHADOW_LDFLAGS += -L$(USE_CUDA_PATH)/lib64 +endif + +ifeq ($(USE_BLAS), mkl) +ifneq ($(USE_INTEL_PATH), NONE) + MSHADOW_LDFLAGS += -L$(USE_INTEL_PATH)/mkl/lib/intel64 + MSHADOW_LDFLAGS += -L$(USE_INTEL_PATH)/lib/intel64 + MSHADOW_CFLAGS += -I$(USE_INTEL_PATH)/mkl/include +endif + MSHADOW_LDFLAGS += -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 +else + MSHADOW_CFLAGS += -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0 +endif +ifeq ($(USE_BLAS), openblas) + MSHADOW_LDFLAGS += -lopenblas +else ifeq ($(USE_BLAS), atlas) + MSHADOW_LDFLAGS += -lcblas +else ifeq ($(USE_BLAS), blas) + MSHADOW_LDFLAGS += -lblas +else ifeq ($(USE_BLAS), apple) + MSHADOW_CFLAGS += -I/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers/ + MSHADOW_LDFLAGS += -framework Accelerate +endif + +ifeq ($(PS_PATH), NONE) +PS_PATH = .. 
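+# default to the parent directory when no parameter server checkout is configured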
+endif +ifeq ($(PS_THIRD_PATH), NONE) +PS_THIRD_PATH = $(PS_PATH)/third_party +endif + +ifeq ($(USE_DIST_PS),1) +MSHADOW_CFLAGS += -DMSHADOW_DIST_PS=1 -std=c++11 \ + -I$(PS_PATH)/src -I$(PS_THIRD_PATH)/include +PS_LIB = $(addprefix $(PS_PATH)/build/, libps.a libpsmain.a) \ + $(addprefix $(PS_THIRD_PATH)/lib/, libgflags.a libzmq.a libprotobuf.a \ + libglog.a libz.a libsnappy.a) +MSHADOW_NVCCFLAGS += --std=c++11 +else + MSHADOW_CFLAGS+= -DMSHADOW_DIST_PS=0 +endif diff --git a/mshadow-ps/.gitignore b/mshadow-ps/.gitignore new file mode 100644 index 000000000000..076c1aa82e8b --- /dev/null +++ b/mshadow-ps/.gitignore @@ -0,0 +1,3 @@ +Makefile +test +test.cpp diff --git a/mshadow-ps/README.md b/mshadow-ps/README.md new file mode 100644 index 000000000000..9c90cc9f3c9d --- /dev/null +++ b/mshadow-ps/README.md @@ -0,0 +1,4 @@ +mshadow-ps +==== +This folder contains mshadow-ps parameter server interface for mshadow GPU/CPU Tensor. See [guide on mshadow-ps](../guide/mshadow-ps) for introduction of the interface. + diff --git a/mshadow-ps/kv_array.h b/mshadow-ps/kv_array.h new file mode 100644 index 000000000000..8f9c96e2807c --- /dev/null +++ b/mshadow-ps/kv_array.h @@ -0,0 +1,125 @@ +#pragma once +#include "parameter/shared_parameter.h" +#include "ps.h" +namespace PS { + +DECLARE_string(app_name); + +template +class KVArray : public SharedParameter { + public: + KVArray(const string& my_name = FLAGS_app_name + "_model", + const string& parent_name = FLAGS_app_name) : + SharedParameter(my_name, parent_name) { } + virtual ~KVArray() { } + + void setArray(int key, V* data, size_t size) { + val_[key] = SArray(data, size, false); + } + void setUpdater(mshadow::ps::IModelUpdater* updater) { + updater_ = updater; + } + + // SArray& array(int key) { return val_[key]; } + + // funcs will be called by the system + MessagePtrList slice(const MessagePtr& msg, const KeyRangeList& krs); + void getValue(const MessagePtr& msg); + void setValue(const MessagePtr& msg); + protected: + std::unordered_map> val_; + // an array is placed into multiple servers only if its length > min_slice_size + size_t min_slice_size_ = 1000; + mshadow::ps::IModelUpdater* updater_ = nullptr; +}; + + +template +void KVArray::setValue(const MessagePtr& msg) { + CHECK_EQ(msg->value.size(), 1); + SArray recv_data(msg->value[0]); + Range kr(msg->task.key_range()); + CHECK_EQ(kr.size(), recv_data.size()); + int key = msg->task.key_channel(); + auto& my_val = val_[key]; + + if (isWorker()) { + if (my_val.empty()) my_val.resize(kr.size(), 0); + CHECK_GE(my_val.size(), kr.end()); + my_val.segment(kr).copyFrom(recv_data); + } else if (isServer()) { + // TODO this server can do flexible consistency control here + + if (my_val.empty()) { + // initialize weight + my_val.resize(kr.size(), 0); + CHECK_NOTNULL(updater_)->InitModel(key, my_val.data(), my_val.size()); + } + + // update weight + CHECK_GE(my_val.size(), kr.size()); + CHECK_NOTNULL(updater_)->Update(key, recv_data.data(), recv_data.size()); + } +} + +// only be called at servers, namely a worker pull data from this server +template +void KVArray::getValue(const MessagePtr& msg) { + auto& my_val = val_[msg->task.key_channel()]; + Range kr(msg->task.key_range()); + if (my_val.empty()) { + // initialize weight + my_val.resize(kr.size(), 0); + CHECK_NOTNULL(updater_)->InitModel(msg->task.key_channel(), my_val.data(), my_val.size()); + } + + // TODO store the kr in memory + CHECK_EQ(my_val.size(), kr.size()); + SArray send_data(kr.size()); + send_data.copyFrom(my_val); + 
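+  // attach the copied weights to the reply message, so the pulling worker
+  // receives the current server-side model for this key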
msg->addValue(send_data); +} + +// divide a message into n part, where part i goes to server i. it's a zero-copy +// implementation +template +MessagePtrList KVArray::slice(const MessagePtr& msg, const KeyRangeList& krs) { + // divide the key range + size_t n = krs.size(); + MessagePtrList ret(n); + Range kr(msg->task.key_range()); + for (size_t i = 0; i < n; ++i) { + ret[i] = MessagePtr(new Message()); + ret[i]->miniCopyFrom(*msg); + ret[i]->valid = true; + auto mut_kr = ret[i]->task.mutable_key_range(); + if (kr.size() < min_slice_size_) { + if (i == 0) { + // server 0 get all data + kr.to(mut_kr); + } else { + Range(0,0).to(mut_kr); + // do not sent to server 1 - n + ret[i]->valid = false; + } + } else { + kr.evenDivide(n, i).to(mut_kr); + } + } + + // divide the data + for (size_t i = 0; i < msg->value.size(); ++i) { + SArray data(msg->value[i]); + CHECK_EQ(data.size(), kr.size()); + for (size_t j = 0; j < n; ++j) { + if (ret[j]->valid) { + Range kr(ret[j]->task.key_range()); + ret[j]->addValue(data.segment(kr)); + } + } + } + return ret; +} + + +} // namespace PS diff --git a/mshadow-ps/ps.h b/mshadow-ps/ps.h new file mode 100644 index 000000000000..6e6b08d2bd64 --- /dev/null +++ b/mshadow-ps/ps.h @@ -0,0 +1,303 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file ps.h + * \brief parameter server abstraction for mshadow tensor + * this is a plugin of mshadow that can be used to syncrhonize + * parameters across device and machines + * + * \author Tianqi Chen, Mu Li + */ +#ifndef MSHADOW_PS_H_ +#define MSHADOW_PS_H_ +#include +// optionally support of lambda function in C++11, if available +#if __cplusplus >= 201103L +#include +#endif // C++11 +#include "../mshadow/tensor.h" + +/*! \brief whether to adapt distributed PS from parameter-server */ +#ifndef MSHADOW_DIST_PS +#define MSHADOW_DIST_PS 1 +#endif + +namespace mshadow { +/*! \brief namespace of mshadow-ps */ +namespace ps { +/*! + * \brief interface of parameter server + * \tparam xpu the device of the data lies + * \tparam DType the type of element in the tensor + */ +template +class ISharedModel { + public: + /*! + * \brief callback function that will be executed when pull request finishes + * before calling the callback, the thread context is already switched + * to the device of pullrequest + * \param stream the stream of callback thread, it is recommended to operate using this stream + * \param arg the argument of callback function + */ + typedef void (CallbackFunction) (Stream *stream, void *arg); + /*! \brief virtual destructor */ + virtual ~ISharedModel(void) {} + /*! + * \brief Set param for the layer from string + * \param name parameter name + * \param val string for configuration + */ + virtual void SetParam(const char *name, const char *val) {} + /*! + * \brief initialize the paramerver server client + * \param devices specifies the possible device id + * to be input from Push and Pull, + */ + virtual void Init(const std::vector &devices) {} + /*! + * \brief initialize the paramerver server client + * without specifying the devices, only device 0 is allowed + */ + inline void Init(void) { + std::vector dev; + dev.push_back(0); + this->Init(dev); + } + /*! 
+ * \brief initialize a key with certain shape + * must be called before using Push/PullReq/PullWait + * on the corresponding key + * \param shape the shape content of the key + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + */ + template + inline void InitKey(Shape shape, + int key, int devid) { + this->InitKey_(shape.FlatTo2D(), key, devid); + } + /*! + * \brief wait until the pull event finishes + * if there was no pull request, wait will directly returns + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + */ + virtual void PullWait(int key, int devid = 0) = 0; + /*! + * \brief push out a tensor to parameter server + * this call is asynchronize and returns immediately + * + * \param data the data + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + * \param priority the priority of this operation, + * the bigger the number is the higher the priority will be + */ + template + inline void Push(Tensor data, + int key, + int devid = 0, + int priority = 0) { + this->Push_(data.FlatTo2D(), key, devid, priority); + } + /*! + * \brief send a pull request, to pull parameter into data + * this call is asynchronize and returns immediately + * use PullWait to wait the event of copy finish + * + * \param data the data + * \param key the unique key to indicate the tensor, + * this is unique per device + * \param devid the device id this tensor lies in + * \param priority the priority of this operation, + * the bigger the number is the higher the priority will be + * \param callback the callback function that will + * be invoked when the request finishes + * \param callback_arg the argument to pass to callback + */ + template + inline void PullReq(Tensor data, + int key, + int devid = 0, + int priority = 0, + CallbackFunction callback = NULL, + void *callback_arg = NULL) { + this->PullReq_(data.FlatTo2D(), key, + devid, priority, callback, callback_arg); + } +#if __cplusplus >= 201103L + /*! + * \brief send a pull request, to pull parameter into data + * this call is asynchronize and returns immediately + * use PullWait to wait the event of copy finish + * this is the c++11 version that allows lambda function as callback + * \param data the data + * \param key the unique key to indicate the tensor, + * this is unique per device + * \param devid the device id this tensor lies in + * \param priority the priority of this operation, + * the bigger the number is the higher the priority will be + * \param callback the callback function + */ + template + inline void PullReq(Tensor data, + int key, + int devid, + int priority, + std::function *stream)> callback) { + // need to allocate space, because callback can happen latter.. + auto calbk = new std::function *stream)>(); + *calbk = callback; + this->PullReq(data, key, devid, priority, InvokeLambda_, calbk); + } +#endif // C++11 + protected: + /*! + * \brief initialize a key with certain shape + * \param shape the shape content of the key + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + */ + virtual void InitKey_(Shape<2> shape, + int key, int devid) = 0; + /*! 
+ * \brief push out a tensor to parameter server + * this call is asynchronize and returns immediately + * + * \param data the data + * \param key the unique key to indicate the tensor + * this is unique per device + * \param devid the device id this tensor lies in + * \param priority the priority of this operation, + * the bigger the number is the higher the priority will be + */ + virtual void Push_(Tensor data, + int key, + int devid = 0, + int priority = 0) = 0; + /*! + * \brief send a pull request, to pull parameter into data + * this call is asynchronize and returns immediately + * use PullWait to wait the event of copy finish + * + * \param data the data + * \param key the unique key to indicate the tensor, + * this is unique per device + * \param devid the device id this tensor lies in + * \param priority the priority of this operation, + * the bigger the number is the higher the priority will be + * \param callback the callback function that will + * be invoked when the request finishes + * \param callback_arg the argument to pass to callback + */ + virtual void PullReq_(Tensor data, + int key, + int devid, + int priority, + CallbackFunction callback, + void *callback_arg) = 0; + + private: +// C++11 support for lambda prepare function +#if __cplusplus >= 201103L + /*! \brief hack function to convert lambda to callback function */ + inline static void InvokeLambda_(Stream *stream, void *fun) { + auto *fp = static_cast *stream)>*>(fun); + (*fp)(stream); + delete fp; + } +#endif // C++11 +}; +/*! \brief interface for customized mshadow server */ +template +class IModelUpdater { + public: + virtual ~IModelUpdater(void) {} + /*! + * \brief set parameters from outside + * \param name name of parameter + * \param val value of parameter + */ + virtual void SetParam(const char *name, const char *val) {} + /*! + * \brief init the model updater + * \param rank the rank of the node + * \param conf configuration + */ + virtual void InitUpdater(int rank, const std::string &conf) {} + /*! + * \brief initialize the model + * \param key the key of data we point to + * \param dptr the data pointer + * \param size size of the parameter key + */ + virtual void InitModel(int key, DType *dptr, size_t size) { + this->InitModel_(key, Tensor(dptr, Shape1(size))); + } + /*! + * update the model + * \param key the key of data we point to + * \param dptr the data pointer + * \param size size of the parameter key + */ + virtual void Update(int key, DType *dptr, size_t size) { + this->Update_(key, Tensor(dptr, Shape1(size))); + } + + protected: + /*! + * \brief initialize the model, user can implement this one + * to take advantage of tensor operations + * \param key the key of data we point to + * \param data the tensor data corresponding to the data we want to initialize + */ + virtual void InitModel_(int key, Tensor data) { + utils::Error("InitModel: not implemented"); + } + /*! + * \brief update the model, user can implement this one + * to take advantage of tensor operations + * \param key the key of data we point to + * \param data the tensor data corresponding to the data we want to initialize + */ + virtual void Update_(int key, Tensor data) { + utils::Error("InitModel: not implemented"); + } +}; +/*! + * \brief create customized server + * this is a server defined by user + * \return new server + */ +template +IModelUpdater *CreateModelUpdater(void); +} // namespace ps +} // namespace mshadow + +#include "./ps_local-inl.h" +#include "./ps_dist-inl.h" +namespace mshadow { +namespace ps { +/*! 
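+ * A minimal worker-side sketch, not a complete program (assuming `xpu` stands
+ * for mshadow::cpu or mshadow::gpu, `grad` is a 2D tensor living on device
+ * `devid`, `key` is an integer chosen by the caller, and `devices` is the
+ * std::vector of device ids passed to Init):
+ *
+ *   ISharedModel<xpu, float> *ps = CreateSharedModel<xpu, float>("local");
+ *   ps->Init(devices);
+ *   ps->InitKey(grad.shape_, key, devid);
+ *   ps->Push(grad, key, devid);
+ *   ps->PullReq(grad, key, devid);
+ *   ps->PullWait(key, devid);
+ *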
+ * \brief create a parameter server implementation + * \param type the type of paramerver server + * can either be "local" or "dist" + * \return the ISharedModel that can be used to synchronize weights + */ +template +inline ISharedModel *CreateSharedModel(const char *type) { + if (!strcmp("local", type)) return new LocalModel(); +#if MSHADOW_DIST_PS + if (!strcmp("dist", type)) return new DistModel(); +#endif + utils::Error("unknown server type %s\n", type); + return NULL; +} +} // namespace ps +} // namespace mshadow +#endif diff --git a/mshadow-ps/ps_dist-inl.h b/mshadow-ps/ps_dist-inl.h new file mode 100644 index 000000000000..ed955e9da6a1 --- /dev/null +++ b/mshadow-ps/ps_dist-inl.h @@ -0,0 +1,117 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file ps_local-inl.h + * \brief local multi-threading implementation of PS abstraction + * + * \author Tianqi Chen, Mu Li + */ +#ifndef MSHADOW_PS_DIST_INL_H_ +#define MSHADOW_PS_DIST_INL_H_ +#include "./ps.h" +#include "./ps_local-inl.h" + +#if MSHADOW_DIST_PS +#include "./kv_array.h" +#include "system/app.h" +namespace mshadow { +namespace ps { +template +class DistModel : public LocalModel { + public: + // parent type + typedef LocalModel Parent; + + // initialize the parameter server + virtual void Init(const std::vector &devices) { + Parent::Init(devices); + shared_model_ = new PS::KVArray(); + if (this->custom_server != NULL) { + delete this->custom_server; + this->custom_server = NULL; + } + } + virtual ~DistModel(void) { + } + + protected: + // do nothing + virtual void InitCustomerServer(void) { + } + virtual void ServerInitKey(Tensor weight, int key) { + // this is called when key get initialized for the first time + // weight can be used to hold the model that pulled back + // use this to initialize the key on serverside + using namespace PS; + MessagePtr pull_msg(new Message(kServerGroup)); + pull_msg->task.set_key_channel(key); + Range(0, weight.MSize()).to(pull_msg->task.mutable_key_range()); + shared_model_->setArray(key, weight.dptr_, weight.MSize()); + pull_msg->fin_handle = [this, weight, key]() { + // call PullReady to notify LocalServer pulling is ready + this->PullReady(weight, key); + }; + shared_model_->pull(pull_msg); + } + // override this function, to use parameter server + virtual void HandlePushFinish(Tensor data, + int key) { + // here we only use sum reduction, can change to others + for (index_t i = 1; i < data.size(0); ++i) { + data[0] += data[i]; + } + + // push + Tensor sendrecv = data[0]; + using namespace PS; + utils::Assert(data[0].CheckContiguous(), "data must be contiguous"); + SArray val; val.copyFrom(sendrecv.dptr_, sendrecv.MSize()); + MessagePtr push_msg(new Message(kServerGroup)); + push_msg->addValue(val); + // LL << val; + push_msg->task.set_key_channel(key); + Range(0, val.size()).to(push_msg->task.mutable_key_range()); + int push_time = CHECK_NOTNULL(shared_model_)->push(push_msg); + + // pull + MessagePtr pull_msg(new Message(kServerGroup, -1, push_time)); + pull_msg->task.set_key_channel(key); + Range(0, sendrecv.MSize()).to(pull_msg->task.mutable_key_range()); + shared_model_->setArray(key, sendrecv.dptr_, sendrecv.MSize()); + pull_msg->fin_handle = [this, sendrecv, key]() { + // call PullReady to notify LocalServer pulling is ready + this->PullReady(sendrecv, key); + }; + shared_model_->pull(pull_msg); + } + + private: + PS::KVArray* shared_model_ = nullptr; +}; + +template +class MShadowServerNode : public PS::App { + public: + // conf: get from the flag -app_conf + 
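+  // the constructor creates the user-defined IModelUpdater via
+  // CreateModelUpdater and hands it to the KVArray, so updates for
+  // pushed gradients are applied on the server side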
MShadowServerNode(const std::string &conf) : App() { + updater_ = CreateModelUpdater(); + + updater_->InitServer(myRank(), conf); + shared_model_ = new PS::KVArray(); + shared_model_->setUpdater(updater_); + } + virtual ~MShadowServerNode() { + delete updater_; + delete shared_model_; + } + private: + IModelUpdater *updater_; + PS::KVArray* shared_model_; +}; + +// NOTE: do not add PS::CreateServer here add it in the program that uses +// mshadow-ps + +} // namespace ps +} // namespace msahdow +#endif +#endif diff --git a/mshadow-ps/ps_local-inl.h b/mshadow-ps/ps_local-inl.h new file mode 100644 index 000000000000..fa092dc68bce --- /dev/null +++ b/mshadow-ps/ps_local-inl.h @@ -0,0 +1,734 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file ps_local-inl.h + * \brief local multi-threading implementation of PS abstraction + * + * \author Tianqi Chen, Mu Li + */ +#ifndef MSHADOW_PS_LOCAL_INL_H_ +#define MSHADOW_PS_LOCAL_INL_H_ +#include +#include +#if defined(_OPENMP) +#include +#ifdef _MSC_VER +typedef int ms_omp_uint; +#else +typedef unsigned ms_omp_uint; +#endif +#endif + +#include "./thread.h" +#include "./thread_util.h" + +namespace mshadow { +namespace ps { +// multi-threaded implementation of +template +class LocalModel : public ISharedModel { + public: + // redefine callback function + typedef typename ISharedModel::CallbackFunction + CallbackFunction; + // constructor + LocalModel(void) { + init_end = 0; + perdev_pull_thread = 1; + perdev_push_thread = 1; + bigarray_bound = 1000 * 1000; + nthread_reduction = 8; + use_pin_memory = 1; + destroy_signal = false; + custom_server = NULL; + } + // destructor + virtual ~LocalModel(void) { + if (init_end != 0) { + destroy_signal = true; + for (size_t i = 0; i < push_queues.size(); ++i) { + push_queues[i].Abort(1); + } + for (size_t i = 0; i < pull_queues.size(); ++i) { + pull_queues[i].Abort(1); + } + for (size_t i = 0; i < thread_push_handler.size(); ++i) { + thread_push_handler[i].Join(); + } + for (size_t i = 0; i < thread_pull_handler.size(); ++i) { + thread_pull_handler[i].Join(); + } + for (size_t i = 0; i < push_queues.size(); ++i) { + push_queues[i].Destroy(); + } + push_map.Destroy(); + push_lock.Destroy(); + for (size_t i = 0; i < pull_queues.size(); ++i) { + pull_queues[i].Destroy(); + } + pull_map.Destroy(); + request_lock.Destroy(); + wait_lock.Destroy(); + wait_cond.Destroy(); + } + if (custom_server != NULL) delete custom_server; + } + virtual void SetParam(const char *name, const char *val) { + int key; + if (sscanf(name, "push_op[%d]", &key) == 1) { + if (!strcmp(val, "gather")) { + request_lock.Lock(); + push_operation[key] = kGather; + request_lock.Unlock(); + return; + } + if (!strcmp(val, "sum")) { + push_operation[key] = kSum; return; + } + utils::Error("unknown push operation %s", val); + } + if (!strcmp(name, "reduce_thread")) { + nthread_reduction = atoi(val); + } + if (!strcmp(name, "use_pin_memory")) { + use_pin_memory = atoi(val); + } + if (!strcmp(name, "bigarray_bound")) { + bigarray_bound = static_cast(atol(val)); + } + if (!strcmp(name, "pull_thread")) { + if (!strcmp(val, "ndev")) { + perdev_pull_thread = 1; + } else if (!strcmp(val, "one")) { + perdev_pull_thread = 0; + } else { + utils::Error("invalid value for parameter pull_thread,"\ + " can only be ndev or one"); + } + } + if (!strcmp(name, "push_thread")) { + if (!strcmp(val, "ndev")) { + perdev_push_thread = 1; + } else if (!strcmp(val, "one")) { + perdev_push_thread = 0; + } else { + utils::Error("invalid value for parameter push_thread,"\ + " can 
only be ndev or one"); + } + } + if (!strcmp(name, "update_on_server")) { + update_on_server = atoi(val); + } + cfgvec.push_back(std::make_pair(std::string(name), + std::string(val))); + } + virtual void PullWait(int key, int devid) { + const int wid = GetWorkIndex(devid); + PullEntry *p = pull_map.Get(key); + if (p == NULL || p->wait.size() == 0) return; + PullEntry &e = *p; + // wake up waiters if any + utils::Assert(e.wait.size() == devices.size(), + "PullWait: must initialize the wait"); + PullWaitRecord &w = e.wait[wid]; + if (!w.finished) { + wait_lock.Lock(); + w.nwait += 1; + while (!w.finished) { + wait_cond.Wait(&wait_lock); + } + w.nwait -= 1; + utils::Assert(w.nwait >= 0, "boundary check"); + wait_lock.Unlock(); + } + } + virtual void Init(const std::vector &devices) { + utils::Check(init_end == 0, + "LocalServer.Init can only call Init once"); + utils::Check(devices.size() != 0, + "LocalServer.Init: must at least contain 1 devices"); + this->devices = devices; + destroy_signal = false; + // initialize device id to local index + dev2index.clear(); + for (size_t i = 0; i < devices.size(); ++i) { + int devid = devices[i]; + utils::Assert(devid >= 0, "device id must be bigger than 0"); + if (devid >= static_cast(dev2index.size())) { + dev2index.resize(devid + 1, -1); + } + dev2index[devid] = static_cast(i); + } + // allocate space + pull_stream.resize(devices.size()); + push_stream.resize(devices.size()); + // initialize all the thread related things + if (perdev_push_thread != 0) { + push_queues.resize(devices.size()); + } else { + push_queues.resize(1); + } + for (size_t i = 0; i < push_queues.size(); ++i) { + push_queues[i].Init(); + } + push_map.Init(); + push_lock.Init(); + pull_map.Init(); + request_lock.Init(); + wait_lock.Init(); + wait_cond.Init(); + if (perdev_pull_thread != 0) { + pull_queues.resize(devices.size()); + } else { + pull_queues.resize(1); + } + for (size_t i = 0; i < pull_queues.size(); ++i) { + pull_queues[i].Init(); + } + // initialize the thread + if (perdev_push_thread != 0) { + thread_push_handler.resize(devices.size()); + for (size_t i = 0; i < devices.size(); ++i) { + std::pair *p + = new std::pair(); + *p = std::make_pair(this, i); + thread_push_handler[i].Start(PushLocalThread, p); + } + } else { + thread_push_handler.resize(1); + thread_push_handler[0].Start(PushGlobalThread, this); + } + // initialize pull handler + if (perdev_pull_thread != 0) { + thread_pull_handler.resize(devices.size()); + for (size_t i = 0; i < devices.size(); ++i) { + std::pair *p + = new std::pair(); + *p = std::make_pair(this, i); + thread_pull_handler[i].Start(PullLocalThread, p); + } + } else { + thread_pull_handler.resize(1); + thread_pull_handler[0].Start(PullGlobalThread, this); + } + this->InitCustomerServer(); + this->init_end = 1; + } + + protected: + /*! \brief operation performed locally in PS */ + enum LocalOp { + /*! \brief take sum of all devices over the same key */ + kSum = 0, + /*! 
+ * \brief concatenate(gather), + * the tensors in all devices with same key + */ + kGather = 1 + }; + virtual void InitKey_(Shape<2> shape, + int key, int devid) { + this->InitPullMap(key); + this->InitPushMap(key, shape); + } + + virtual void Push_(Tensor data, + int key, int devid, int priority) { + PullEntry &e = pull_map.GetRef(key); + e.req[GetWorkIndex(devid)].ready = false; + if (perdev_push_thread != 0) { + int wid = GetWorkIndex(devid); + push_queues[wid].Push(PullTask(data, key, devid), priority); + } else { + push_queues[0].Push(PullTask(data, key, devid), priority); + } + } + virtual void PullReq_(Tensor data, + int key, int devid, int priority, + CallbackFunction callback, + void *callback_arg) { + PullEntry &e = pull_map.GetRef(key); + utils::Assert(e.req.size() == devices.size(), + "PullReq: must initialize the key, req"); + utils::Assert(e.wait.size() == devices.size(), + "PullReq: must initialize the key, wait"); + const int wid = GetWorkIndex(devid); + PullReqRecord &r = e.req[wid]; + r.dest = data; + r.priority = priority; + r.callback = callback; + r.callback_arg = callback_arg; + // reset pull request finish mark + wait_lock.Lock(); + e.wait[wid].finished = false; + wait_lock.Unlock(); + // check ready event + request_lock.Lock(); + utils::Check(!r.pending, + "key = %d, cannot send duplicate pull request before it finishes", + key); + if (e.req[wid].ready) { + if (perdev_pull_thread != 0) { + pull_queues[wid].Push(std::make_pair(key, devid)); + } else { + pull_queues[0].Push(std::make_pair(key, devid)); + } + } else { + r.pending = true; + } + request_lock.Unlock(); + } + /*! + * \brief called to notify that the data is ready for pull + * \param data the data that can be pulled back + * \param the key of the data + */ + virtual void PullReady(Tensor data, int key) { + PullEntry &e = pull_map.GetRef(key); + utils::Assert(e.req.size() == devices.size(), + "PullReady: must initialize the key, req"); + request_lock.Lock(); + e.src = data; + for (index_t i = 0; i < e.req.size(); ++i) { + e.req[i].ready = true; + if (e.req[i].pending) { + if (perdev_pull_thread != 0) { + pull_queues[i].Push(std::make_pair(key, devices[i])); + } else { + pull_queues[0].Push(std::make_pair(key, devices[i])); + } + e.req[i].pending = false; + } + } + request_lock.Unlock(); + } + virtual void ServerInitKey(Tensor weight, int key) { + if (custom_server != NULL) { + // intialize server, and ready for pullback + custom_server->InitModel(key, weight.dptr_, weight.MSize()); + this->PullReady(weight, key); + } + } + /*! 
+ * \brief event handler for push finish + * called when all the data with same key comes int + * \param data the buffer holds the data in all devices + * \param result_buffer temporal buffer to hold the reduction result + * \param key the key of the data + */ + virtual void HandlePushFinish(Tensor data, + int key) { + LocalOp op = kSum; + typename std::map::const_iterator + it = push_operation.find(key); + if (it != push_operation.end() && it->first == key) { + op = it->second; + } + // customized server + if (custom_server != NULL) { + this->ReduceSum(data); + custom_server->Update(key, data[0].dptr_, data[0].MSize()); + PushEntry &e = push_map.GetRef(key); + this->PullReady(e.weight, key); + return; + } + switch (op) { + case kSum: { + this->ReduceSum(data); + this->PullReady(data[0], key); + return; + } + case kGather: { + this->PullReady(data.FlatTo2D(), key); + return; + } + default: utils::Error("unknown LocalOp"); + } + } + + virtual void InitCustomerServer(void) { + if (update_on_server != 0) { + custom_server = CreateModelUpdater(); + for (size_t j = 0; j < cfgvec.size(); ++j) { + custom_server->SetParam(cfgvec[j].first.c_str(), + cfgvec[j].second.c_str()); + } + custom_server->InitUpdater(0, std::string()); + } + } + protected: + // customized server + IModelUpdater *custom_server; + private: + /*! \brief task running */ + struct PullTask { + /*! \brief the task data source */ + Tensor data; + /*! \brief the key to the tensor */ + int key; + /*! + * \brief the device id, (key,devid), + * uniquely identifies a mem location + */ + int devid; + PullTask(void) {} + PullTask(Tensor data, int key, int devid) + : data(data), key(key), devid(devid) {} + }; + /*! \brief data structure to hold temporal push result */ + struct PushEntry { + // temporal space to hold input data + Tensor data; + // temporal space to hold weight, if needed + Tensor weight; + // indicator whether the certain devices is already copied in + std::vector copied; + // number of data copied in + int num_copied; + // version number of data used to hold incomming data in push + int copyin_version; + // use pinned memory + bool pin_memory; + // constructor + PushEntry(void) + : copyin_version(0) { + weight.dptr_ = NULL; + } + ~PushEntry(void) { + if (data.dptr_ != NULL) { + if (pin_memory) { + mshadow::FreeHost(&data); + if (weight.dptr_ != NULL) { + mshadow::FreeHost(&weight); + } + } else { + mshadow::FreeSpace(&data); + if (weight.dptr_ != NULL) { + mshadow::FreeSpace(&weight); + } + } + } + } + // constructor + inline void Init(int ndevice, Shape<2> shape, + bool pin_memory, bool need_weight) { + this->pin_memory = pin_memory; + data.shape_ = Shape4(2, ndevice, shape[0], shape[1]); + weight.shape_ = shape; + if (pin_memory) { + mshadow::AllocHost(&data); + if (need_weight) mshadow::AllocHost(&weight); + } else { + mshadow::AllocSpace(&data, false); + if (need_weight) mshadow::AllocSpace(&weight); + } + utils::Assert(data.CheckContiguous(), "Init"); + utils::Assert(!need_weight || weight.CheckContiguous(), "Init"); + num_copied = 0; + copied.resize(ndevice, false); + } + }; + // a record to remember things related to pull request + struct PullReqRecord { + // whether this record contains a pending request + // whether pull is ready to go + bool ready; + // waiting for pull ready + bool pending; + // the destination to pull data into + Tensor dest; + // the priority of the + int priority; + // callback function + CallbackFunction *callback; + // argument for callback + void *callback_arg; + PullReqRecord(void) : 
ready(false), pending(false) { + } + }; + // a record to help handle pullwait + struct PullWaitRecord { + // number of thread that waits for the request to finish + int nwait; + // the request was finished + bool finished; + PullWaitRecord(void) + : nwait(0), finished(true) { + // set finished to true so pull without pull request returns + } + }; + /*! \brief data structure to hold pull request */ + struct PullEntry { + // data to be pulled back + Tensor src; + // pullrequest record + std::vector req; + // whether there is thread waiting on this event + std::vector wait; + PullEntry(void) { + } + }; + // signal to notify all the thread about class destruction + bool destroy_signal; + // vector of devices + std::vector devices; + // device index to local index + std::vector dev2index; + //----- data structure used to support push ---- + // stream used by push thread each device for memcpy + std::vector*> push_stream; + // the queue used for push task + std::vector > push_queues; + // thread to handle push task + std::vector thread_push_handler; + // lock to lock push field + utils::Mutex push_lock; + // the map of push buffer + utils::ThreadSafeMap push_map; + // customized local reduction operation + std::map push_operation; + //----- data structure used to support pull ---- + // the queue used for pull task + std::vector > > pull_queues; + // stream used by pull thread each device for memcpy + std::vector*> pull_stream; + // the map to store pull status + utils::ThreadSafeMap pull_map; + // thread to handle pull task + std::vector thread_pull_handler; + // lock to lock request field + utils::Mutex request_lock; + // lock to lock wait field + utils::Mutex wait_lock; + // conditional variable to do waiting + utils::ConditionVariable wait_cond; + //---------configurations of server------- + int init_end; + // whether perform update on serverside + int update_on_server; + // use pinned memory + int use_pin_memory; + // number of reduction thread + int nthread_reduction; + // the threshold for big array + size_t bigarray_bound; + // whether use pull thread per device + int perdev_pull_thread; + // whether use push thread per device + int perdev_push_thread; + /*! 
\brief history of configurations */ + std::vector< std::pair > cfgvec; + // perform sum reduction + inline void ReduceSum(Tensor data) { + #if defined(_OPENMP) + if (data[0].MSize() >= bigarray_bound && + nthread_reduction != 0) { + ms_omp_uint ntask = static_cast(data.size(1)); + #pragma omp parallel for schedule(static) num_threads(nthread_reduction) + for (ms_omp_uint j = 0; j < ntask; ++j) { + for (index_t i = 1; i < data.size(0); ++i) { + data[0][j] += data[i][j]; + } + } + } else + #endif + { + for (index_t i = 1; i < data.size(0); ++i) { + data[0] += data[i]; + } + } + } + // push handler + inline void PushProc(utils::ThreadPQueue *queue) { + while (!destroy_signal) { + PullTask tsk; + if (queue->Pop(&tsk)) { + const int wid = GetWorkIndex(tsk.devid); + PushEntry &e = push_map.GetRef(tsk.key); + utils::Check(e.data[0][0].shape_ == tsk.data.shape_, + "Tensor with same key must share same shape"); + utils::Assert(!e.copied[wid], "data inconsistency"); + // start copy + SetDevice(tsk.devid); + Copy(e.data[e.copyin_version][wid], tsk.data, push_stream[wid]); + // wait till the copy finishes + push_stream[wid]->Wait(); + // mark copied + e.copied[wid] = true; + push_lock.Lock(); + e.num_copied += 1; + int cp_version = e.copyin_version; + bool push_finish = e.num_copied >= static_cast(devices.size()); + if (push_finish) { + // switch version + e.copyin_version = (e.copyin_version + 1) % e.data.size(0); + std::fill(e.copied.begin(), e.copied.end(), false); + e.num_copied = 0; + } + push_lock.Unlock(); + if (push_finish) { + this->HandlePushFinish(e.data[cp_version], tsk.key); + } + } else { + utils::Assert(destroy_signal, "abort but not destroy"); + } + } + } + inline void PushHandlerGlobal(void) { + // allocate stream resources + for (size_t i = 0; i < devices.size(); ++i) { + SetDevice(devices[i]); + push_stream[i] = NewStream(); + } + this->PushProc(&push_queues[0]); + // free resources + for (size_t i = 0; i < devices.size(); ++i) { + SetDevice(devices[i]); + DeleteStream(push_stream[i]); + } + } + inline void PushHandlerLocal(size_t tid) { + utils::Assert(tid < devices.size(), "threadid exceed boundary"); + utils::Assert(push_queues.size() == devices.size(), + "must have one pull_queue per device"); + // allocate stream resources + SetDevice(devices[tid]); + push_stream[tid] = NewStream(); + this->PushProc(&push_queues[tid]); + SetDevice(devices[tid]); + DeleteStream(push_stream[tid]); + } + /*!\brief entry point of loader thread */ + inline static MSHADOW_THREAD_PREFIX PushGlobalThread(void *pthread) { + static_cast(pthread)->PushHandlerGlobal(); + utils::ThreadExit(NULL); + return NULL; + } + inline static MSHADOW_THREAD_PREFIX PushLocalThread(void *arg) { + std::pair *p + = static_cast*>(arg); + p->first->PushHandlerLocal(p->second); + delete p; + utils::ThreadExit(NULL); + return NULL; + } + // push handler procedure + inline void PullProc(utils::ThreadPQueue > *queue) { + while (!destroy_signal) { + std::pair tsk; + if (queue->Pop(&tsk)) { + const int key = tsk.first; + const int devid = tsk.second; + const int wid = GetWorkIndex(devid); + PullEntry &e = pull_map.GetRef(key); + { + // handle request + utils::Assert(e.req.size() == devices.size(), + "PullHandler: must initialize the key, req"); + PullReqRecord &r = e.req[wid]; + SetDevice(devid); + Copy(r.dest, e.src, pull_stream[wid]); + // callback, if any + if (r.callback != NULL) { + (*r.callback)(pull_stream[wid], r.callback_arg); + } + // wait till the operation finishes + pull_stream[wid]->Wait(); + } + { + // wake up 
waiters if any + utils::Assert(e.wait.size() == devices.size(), + "PullHandler, must initialize the key, req"); + PullWaitRecord &w = e.wait[wid]; + wait_lock.Lock(); + w.finished = true; + if (w.nwait != 0) { + wait_cond.Broadcast(); + } + wait_lock.Unlock(); + } + } else { + utils::Assert(destroy_signal, "abort but not destroy"); + } + } + } + // use one thread for all pull actions + inline void PullHandlerGlobal(void) { + // allocate stream resources + for (size_t i = 0; i < devices.size(); ++i) { + SetDevice(devices[i]); + pull_stream[i] = NewStream(); + } + this->PullProc(&pull_queues[0]); + // free resources + for (size_t i = 0; i < devices.size(); ++i) { + SetDevice(devices[i]); + DeleteStream(pull_stream[i]); + } + } + inline void PullHandlerLocal(size_t tid) { + utils::Assert(tid < devices.size(), "threadid exceed boundary"); + utils::Assert(pull_queues.size() == devices.size(), + "must have one pull_queue per device"); + // allocate stream resources + SetDevice(devices[tid]); + pull_stream[tid] = NewStream(); + this->PullProc(&pull_queues[tid]); + SetDevice(devices[tid]); + DeleteStream(pull_stream[tid]); + } + /*!\brief entry point of pull thread, one thread for all devices */ + inline static MSHADOW_THREAD_PREFIX PullGlobalThread(void *arg) { + static_cast(arg)->PullHandlerGlobal(); + utils::ThreadExit(NULL); + return NULL; + } + inline static MSHADOW_THREAD_PREFIX PullLocalThread(void *arg) { + std::pair *p + = static_cast*>(arg); + p->first->PullHandlerLocal(p->second); + delete p; + utils::ThreadExit(NULL); + return NULL; + } + // get internal index of device + inline int GetWorkIndex(int devid) const { + utils::Check(devid >= 0 && + devid < static_cast(dev2index.size()) && + dev2index[devid] >= 0, + "Push: invalid devid"); + return dev2index[devid]; + } + // functions to handle pull + inline void InitPullMap(int key) { + pull_map.Init(key); + PullEntry &e = pull_map.GetRef(key); + request_lock.Lock(); + // must recheck after lock + if (e.req.size() == 0) { + e.req.resize(devices.size(), PullReqRecord()); + } + request_lock.Unlock(); + // check wait map + wait_lock.Lock(); + // must recheck after lock + if (e.wait.size() == 0) { + e.wait.resize(devices.size(), PullWaitRecord()); + } + wait_lock.Unlock(); + } + // functions to handle pull + inline void InitPushMap(int key, Shape<2> shape) { + push_map.Init(key); + PushEntry &e = push_map.GetRef(key); + push_lock.Lock(); + if (e.copied.size() == 0) { + e.Init(devices.size(), shape, + use_pin_memory != 0, update_on_server != 0); + } + this->ServerInitKey(e.weight, key); + push_lock.Unlock(); + } +}; +} // namespace ps +} // namespace mshadow +#endif // MSHADOW_PS_LOCAL_INL_H_ diff --git a/mshadow-ps/thread.h b/mshadow-ps/thread.h new file mode 100644 index 000000000000..382e17a447bf --- /dev/null +++ b/mshadow-ps/thread.h @@ -0,0 +1,251 @@ +#ifndef MSHADOW_PS_THREAD_H_ +#define MSHADOW_PS_THREAD_H_ +/*! + * \file thread.h + * \brief this header include the minimum necessary resource for multi-threading that can be compiled in windows, linux, mac + * \author Tianqi Chen + */ +#ifdef _MSC_VER +#include "../mshadow/utils.h" +#include +#include +namespace mshadow { +namespace utils { +/*! 
\brief simple semaphore used for synchronization */ +class Semaphore { + public : + inline void Init(int init_val) { + sem = CreateSemaphore(NULL, init_val, 10, NULL); + utils::Check(sem != NULL, "create Semaphore error"); + } + inline void Destroy(void) { + CloseHandle(sem); + } + inline void Wait(void) { + utils::Check(WaitForSingleObject(sem, INFINITE) == WAIT_OBJECT_0, "WaitForSingleObject error"); + } + inline void Post(void) { + utils::Check(ReleaseSemaphore(sem, 1, NULL) != 0, "ReleaseSemaphore error"); + } + private: + HANDLE sem; +}; + +/*! \brief mutex under windows */ +class Mutex { + public: + inline void Init(void) { + utils::Check(InitializeCriticalSectionAndSpinCount(&mutex, 0x00000400) != 0, + "Mutex::Init fail"); + } + inline void Lock(void) { + EnterCriticalSection(&mutex); + } + inline void Unlock(void) { + LeaveCriticalSection(&mutex); + } + inline void Destroy(void) { + DeleteCriticalSection(&mutex); + } + + private: + friend class ConditionVariable; + CRITICAL_SECTION mutex; +}; + +// conditional variable that uses pthread +class ConditionVariable { + public: + // initialize conditional variable + inline void Init(void) { + InitializeConditionVariable(&cond); + } + // destroy the thread + inline void Destroy(void) { + //DeleteConditionVariable(&cond); + } + // wait on the conditional variable + inline void Wait(Mutex *mutex) { + utils::Check(SleepConditionVariableCS(&cond, &(mutex->mutex), INFINITE) != 0, + "ConditionVariable:Wait fail"); + } + inline void Broadcast(void) { + WakeAllConditionVariable(&cond); + } + inline void Signal(void) { + WakeConditionVariable(&cond); + } + + private: + CONDITION_VARIABLE cond; +}; + +/*! \brief simple thread that wraps windows thread */ +class Thread { + private: + HANDLE thread_handle; + unsigned thread_id; + public: + inline void Start(unsigned int __stdcall entry(void*), void *param) { + thread_handle = (HANDLE)_beginthreadex(NULL, 0, entry, param, 0, &thread_id); + } + inline int Join(void) { + WaitForSingleObject(thread_handle, INFINITE); + return 0; + } +}; +/*! 
\brief exit function called from thread */ +inline void ThreadExit(void *status) { + _endthreadex(0); +} +#define MSHADOW_THREAD_PREFIX unsigned int __stdcall +} // namespace utils +} // namespace mshadow +#else +// thread interface using g++ +#include +#include +#include +namespace mshadow { +namespace utils { +/*!\brief semaphore class */ +class Semaphore { + #ifdef __APPLE__ + private: + sem_t* semPtr; + char sema_name[20]; + private: + inline void GenRandomString(char *s, const int len) { + static const char alphanum[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" ; + for (int i = 0; i < len; ++i) { + s[i] = alphanum[rand() % (sizeof(alphanum) - 1)]; + } + s[len] = 0; + } + public: + inline void Init(int init_val) { + sema_name[0]='/'; + sema_name[1]='s'; + sema_name[2]='e'; + sema_name[3]='/'; + GenRandomString(&sema_name[4], 16); + if((semPtr = sem_open(sema_name, O_CREAT, 0644, init_val)) == SEM_FAILED) { + perror("sem_open"); + exit(1); + } + utils::Check(semPtr != NULL, "create Semaphore error"); + } + inline void Destroy(void) { + if (sem_close(semPtr) == -1) { + perror("sem_close"); + exit(EXIT_FAILURE); + } + if (sem_unlink(sema_name) == -1) { + perror("sem_unlink"); + exit(EXIT_FAILURE); + } + } + inline void Wait(void) { + sem_wait(semPtr); + } + inline void Post(void) { + sem_post(semPtr); + } + #else + private: + sem_t sem; + public: + inline void Init(int init_val) { + if (sem_init(&sem, 0, init_val) != 0) { + utils::Error("Semaphore.Init:%s", strerror(errno)); + } + } + inline void Destroy(void) { + if (sem_destroy(&sem) != 0) { + utils::Error("Semaphore.Destroy:%s", strerror(errno)); + } + } + inline void Wait(void) { + if (sem_wait(&sem) != 0) { + utils::Error("Semaphore.Wait:%s", strerror(errno)); + } + } + inline void Post(void) { + if (sem_post(&sem) != 0) { + utils::Error("Semaphore.Post:%s", strerror(errno)); + } + } + #endif +}; + +// mutex that works with pthread +class Mutex { + public: + inline void Init(void) { + pthread_mutex_init(&mutex, NULL); + } + inline void Lock(void) { + pthread_mutex_lock(&mutex); + } + inline void Unlock(void) { + pthread_mutex_unlock(&mutex); + } + inline void Destroy(void) { + pthread_mutex_destroy(&mutex); + } + + private: + friend class ConditionVariable; + pthread_mutex_t mutex; +}; + +// conditional variable that uses pthread +class ConditionVariable { + public: + // initialize conditional variable + inline void Init(void) { + pthread_cond_init(&cond, NULL); + } + // destroy the thread + inline void Destroy(void) { + pthread_cond_destroy(&cond); + } + // wait on the conditional variable + inline void Wait(Mutex *mutex) { + pthread_cond_wait(&cond, &(mutex->mutex)); + } + inline void Broadcast(void) { + pthread_cond_broadcast(&cond); + } + inline void Signal(void) { + pthread_cond_signal(&cond); + } + + private: + pthread_cond_t cond; +}; + +/*!\brief simple thread class */ +class Thread { + private: + pthread_t thread; + public : + inline void Start(void * entry(void*), void *param) { + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + pthread_create(&thread, &attr, entry, param); + } + inline int Join(void) { + void *status; + return pthread_join(thread, &status); + } +}; +inline void ThreadExit(void *status) { + pthread_exit(status); +} +} // namespace utils +} // namespace mshadow +#define MSHADOW_THREAD_PREFIX void * +#endif // Linux +#endif // MSHADOW_PS_THREAD_H_ diff --git a/mshadow-ps/thread_util.h b/mshadow-ps/thread_util.h new file mode 100644 index 
000000000000..607d69f83c3a --- /dev/null +++ b/mshadow-ps/thread_util.h @@ -0,0 +1,143 @@ +#ifndef MSHADOW_PS_THREAD_UTIL_H_ +#define MSHADOW_PS_THREAD_UTIL_H_ +/*! + * \file thread_util.h + * \brief data structures for multi-threading communication + * \author Tianqi Chen + */ +#include +#include +#include "./thread.h" +namespace mshadow { +namespace utils { +/*! + * \brief thread safe queue that can be used for customer consumer model + * in the future, it will support priority scheduling + * \tparam DType the content of the queue + */ +template +class ThreadPQueue { + public: + /*! \brief intitialize the queue, must call this before use */ + inline void Init(void) { + lock_.Init(); + counter_.Init(0); + } + /*! \brief destroy the resources on the queue */ + inline void Destroy(void) { + lock_.Destroy(); + counter_.Destroy(); + } + /*! + * \brief Destroy the queue + * wake up all the threads waits on pop + * this is usually used in class destructor + * \param max_nthread the maximum number of thread that + * could be waiting on the queue + */ + inline void Abort(int max_nthread = 1) { + for (int i = 0; i < max_nthread; ++ i) { + counter_.Post(); + } + } + /*! + * \brief push an element to the queue + * \param data the data to be puhed into queue + * \param optionally priority level to hint which + * element should be poped first + */ + inline void Push(const DType &data, int priority = 0) { + lock_.Lock(); + queue_.push(Entry(data, priority)); + lock_.Unlock(); + counter_.Post(); + } + /*! + * \brief pop an element from the queue + * this will block the thread if the queue is empty + * \param data_out the address to put output of the queue + * \return true if a correct element is returned + * false if abort is called and no element was left in queue + */ + inline bool Pop(DType *data_out) { + counter_.Wait(); + lock_.Lock(); + if (queue_.size() == 0) { + lock_.Unlock(); return false; + } + utils::Assert(queue_.size() != 0, "Queue.Pop"); + *data_out = queue_.top().data; + queue_.pop(); + lock_.Unlock(); + return true; + } + + private: + // entry in the queue + struct Entry { + DType data; + int priority; + Entry(const DType &data, int priority) + : data(data), priority(priority) {} + inline bool operator<(const Entry &b) const { + return priority < b.priority; + } + }; + + // the queue to push + std::priority_queue queue_; + // lock for accessing the queue + utils::Mutex lock_; + // counter to count number of push tasks + utils::Semaphore counter_; +}; + +// naive implementation of threadsafe map +template +class ThreadSafeMap { + public: + inline void Init(void) { + lock_.Init(); + } + inline void Destroy(void) { + for (typename std::map::iterator + it = map_.begin(); it != map_.end(); ++it) { + delete it->second; + } + lock_.Destroy(); + } + inline TValue *Get(int key) { + TValue *ret; + lock_.Lock(); + typename std::map::const_iterator + it = map_.find(key); + if (it == map_.end() || it->first != key) { + ret = NULL; + } else { + ret = it->second; + } + lock_.Unlock(); + return ret; + } + inline TValue &GetRef(int key) { + TValue *ret = this->Get(key); + utils::Assert(ret != NULL, "key does not exist"); + return *ret; + } + inline void Init(int key) { + lock_.Lock(); + if (map_.count(key) == 0) { + map_[key] = new TValue(); + } + lock_.Unlock(); + } + + private: + // lock for accessing the queue + utils::Mutex lock_; + std::map map_; +}; + +} // namespace utils +} // namespace mshadow +#endif // MSHADOW_PS_THREAD_UTIL_H_ diff --git a/mshadow/README.md b/mshadow/README.md new file mode 
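`ThreadPQueue` is the producer/consumer channel used by mshadow-ps. A minimal sketch of the intended usage, with a made-up `Task` payload and arbitrary priorities:

```
#include "mshadow-ps/thread_util.h"

// made-up payload type; any copyable, default-constructible type works
struct Task {
  int id;
  Task(void) : id(0) {}
  explicit Task(int id) : id(id) {}
};

void QueueExample(void) {
  mshadow::utils::ThreadPQueue<Task> queue;
  queue.Init();
  // producer side: a larger priority value is popped first
  queue.Push(Task(1), 0);
  queue.Push(Task(2), 10);
  // consumer side: Pop blocks until data arrives or Abort is called
  Task task;
  while (queue.Pop(&task)) {
    if (task.id == 1) break;  // Task(2) is returned before Task(1)
  }
  // wake any consumer still blocked in Pop before tearing down
  queue.Abort(1);
  queue.Destroy();
}
```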
100644 index 000000000000..86276af013e2 --- /dev/null +++ b/mshadow/README.md @@ -0,0 +1,8 @@ +Code Guide +==== +This readme contains notes about code in mshadow. MShadow generally follows Google's C++ Style. + +Convention +==== +* Basically, all the files ends in ```-inl.h, -inl.cuh``` are implementations, and can be ignored if only using mshadow +* The files ends in ```.h``` are heavily commented with [doxyen format](http://www.doxygen.org/), and can be used to generate the corresponding document. diff --git a/mshadow/base.h b/mshadow/base.h new file mode 100644 index 000000000000..6336dfa023bc --- /dev/null +++ b/mshadow/base.h @@ -0,0 +1,359 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file base.h + * \brief definitions of base types, operators, macros functions + * + * \author Bing Xu, Tianqi Chen + */ +#ifndef MSHADOW_BASE_H_ +#define MSHADOW_BASE_H_ +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_DEPRECATE +#define NOMINMAX +#endif +#include +#include +#include +#include +#include +// macro defintiions +/*! + * \brief if this macro is define to be 1, + * mshadow should compile without any of other libs + */ +#ifndef MSHADOW_STAND_ALONE +#define MSHADOW_STAND_ALONE 0 +#endif +/*! \brief whether do padding during allocation */ +#ifndef MSHADOW_ALLOC_PAD +#define MSHADOW_ALLOC_PAD true +#endif +/*! + * \brief + * x dimension of data must be bigger pad_size * ratio to be alloced padded memory, + * otherwise use tide allocation + * for example, if pad_ratio=2, GPU memory alignement size is 32, + * then we will only allocate padded memory if x dimension > 64 + * set it to 0 then we will always allocate padded memory + */ +#ifndef MSHADOW_MIN_PAD_RATIO + #define MSHADOW_MIN_PAD_RATIO 2 +#endif + +#if MSHADOW_STAND_ALONE + #define MSHADOW_USE_CBLAS 0 + #define MSHADOW_USE_MKL 0 + #define MSHADOW_USE_CUDA 0 +#endif + +/*! + * \brief force user to use GPU stream during computation + * error will be shot when default stream NULL is used + */ +#ifndef MSHADOW_FORCE_STREAM +#define MSHADOW_FORCE_STREAM 0 +#endif + +/*! \brief use CBLAS for CBLAS */ +#ifndef MSHADOW_USE_CBLAS + #define MSHADOW_USE_CBLAS 0 +#endif +/*! \brief use MKL for BLAS */ +#ifndef MSHADOW_USE_MKL + #define MSHADOW_USE_MKL 1 +#endif +/*! + * \brief use CUDA support, must ensure that the cuda include path is correct, + * or directly compile using nvcc + */ +#ifndef MSHADOW_USE_CUDA + #define MSHADOW_USE_CUDA 1 +#endif + +/*! + * \brief seems CUDAARCH is deprecated in future NVCC + * set this to 1 if you want to use CUDA version smaller than 2.0 + */ +#ifndef MSHADOW_OLD_CUDA +#define MSHADOW_OLD_CUDA 0 +#endif + +/*! \brief whether use SSE */ +#ifndef MSHADOW_USE_SSE + #define MSHADOW_USE_SSE 1 +#endif +/*! 
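The macros above are compile-time switches and must be defined before any mshadow header is included. A sketch of one hypothetical CPU-only configuration; the chosen values and the tiny example are illustrative and assume a local CBLAS installation:

```
// example.cpp -- one hypothetical CPU-only configuration; the macro values
// are an illustration, pick whatever matches the local BLAS setup
#define MSHADOW_USE_CBLAS 1
#define MSHADOW_USE_MKL   0
#define MSHADOW_USE_CUDA  0
#include "mshadow/tensor.h"

int main(void) {
  float data[6] = {0.0f};
  // wrap existing memory: no allocation happens inside mshadow,
  // stride_ defaults to the lowest dimension (3 here)
  mshadow::Tensor<mshadow::cpu, 2, float> mat(data, mshadow::Shape2(2, 3));
  mat = 1.0f;  // element-wise expression assignment
  return 0;
}
```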
\brief whether use NVML to get dynamic info */ +#ifndef MSHADOW_USE_NVML + #define MSHADOW_USE_NVML 0 +#endif +// SSE is conflict with cudacc +#ifdef __CUDACC__ + #undef MSHADOW_USE_SSE + #define MSHADOW_USE_SSE 0 +#endif + +#if MSHADOW_USE_CBLAS +extern "C" { + #include +} +#elif MSHADOW_USE_MKL + #include + #include + #include + #include +#endif + +#if MSHADOW_USE_CUDA + #include + #include +#endif + +#if MSHADOW_USE_NVML + #include +#endif +// -------------------------------- +// MSHADOW_XINLINE is used for inlining template code for both CUDA and CPU code +#ifdef MSHADOW_XINLINE + #error "MSHADOW_XINLINE must not be defined" +#endif +#ifdef _MSC_VER +#define MSHADOW_FORCE_INLINE __forceinline +#pragma warning( disable : 4068 ) +#else +#define MSHADOW_FORCE_INLINE inline __attribute__((always_inline)) +#endif +#ifdef __CUDACC__ + #define MSHADOW_XINLINE MSHADOW_FORCE_INLINE __device__ __host__ +#else + #define MSHADOW_XINLINE MSHADOW_FORCE_INLINE +#endif +/*! \brief cpu force inline */ +#define MSHADOW_CINLINE MSHADOW_FORCE_INLINE + +#if defined(__GXX_EXPERIMENTAL_CXX0X) ||\ + defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L + #define MSHADOW_CONSTEXPR constexpr +#else + #define MSHADOW_CONSTEXPR const +#endif + +/*! + * \brief default data type for tensor string + * in code release, change it to default_real_t + * during development, change it to empty string so that missing + * template arguments can be detected + */ +#ifndef MSHADOW_DEFAULT_DTYPE +#define MSHADOW_DEFAULT_DTYPE = default_real_t +//#define MSHADOW_DEFAULT_DTYPE +#endif + +/*! \brief namespace for mshadow */ +namespace mshadow { +/*! \brief buffer size for each random number generator */ +const unsigned kRandBufferSize = 1000000; +/*! \brief pi */ +const float kPi = 3.1415926f; +/*! \brief type that will be used for index */ +typedef unsigned index_t; +/*! \brief float point type that will be used in default by mshadow */ +typedef float default_real_t; + +/*! \brief namespace for operators */ +namespace op { +// binary operator +/*! \brief mul operator */ +struct mul{ + /*! \brief map a, b to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + return a * b; + } +}; +/*! \brief plus operator */ +struct plus { + /*! \brief map a, b to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + return a + b; + } +}; +/*! \brief minus operator */ +struct minus { + /*! \brief map a, b to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + return a - b; + } +}; +/*! \brief divide operator */ +struct div { + /*! \brief map a, b to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + return a / b; + } +}; +/*! \brief get rhs */ +struct right { + /*! \brief map a, b to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a, DType b) { + return b; + } +}; +// unary operator/ function: example +// these operators can be defined by user, +// in the same style as binary and unary operator +// to use, simply write F( src ) +/*! \brief identity function that maps a real number to it self */ +struct identity{ + /*! \brief map a to result using defined operation */ + template + MSHADOW_XINLINE static DType Map(DType a) { + return a; + } +}; +} // namespace op +/*! \brief namespace for savers */ +namespace sv { +/*! \brief save to saver: = */ +struct saveto { + /*! 
\brief save b to a using save method */ + template + MSHADOW_XINLINE static void Save(DType &a, DType b) { + a = b; + } + /*! \brief helper constant to use BLAS, alpha */ + inline static default_real_t AlphaBLAS(void) { return 1.0f; } + /*! \brief helper constant to use BLAS, beta */ + inline static default_real_t BetaBLAS(void) { return 0.0f; } + /*! \brief corresponding binary operator type */ + typedef op::right OPType; +}; +/*! \brief save to saver: += */ +struct plusto { + /*! \brief save b to a using save method */ + template + MSHADOW_XINLINE static void Save(DType &a, DType b) { + a += b; + } + /*! \brief helper constant to use BLAS, alpha */ + inline static default_real_t AlphaBLAS(void) { return 1.0f; } + /*! \brief helper constant to use BLAS, beta */ + inline static default_real_t BetaBLAS(void) { return 1.0f; } + /*! \brief corresponding binary operator type */ + typedef op::plus OPType; +}; +/*! \brief minus to saver: -= */ +struct minusto { + /*! \brief save b to a using save method */ + template + MSHADOW_XINLINE static void Save(DType &a, DType b) { + a -= b; + } + /*! \brief helper constant to use BLAS, alpha */ + inline static default_real_t AlphaBLAS(void) { return -1.0f; } + /*! \brief helper constant to use BLAS, beta */ + inline static default_real_t BetaBLAS(void) { return 1.0f; } + /*! \brief corresponding binary operator type */ + typedef op::minus OPType; +}; +/*! \brief multiply to saver: *= */ +struct multo { + /*! \brief save b to a using save method */ + template + MSHADOW_XINLINE static void Save(DType &a, DType b) { + a *= b; + } + /*! \brief corresponding binary operator type */ + typedef op::mul OPType; +}; +/*! \brief divide to saver: /= */ +struct divto { + /*! \brief save b to a using save method */ + template + MSHADOW_XINLINE static void Save(DType& a, DType b) { + a /= b; + } + /*! \brief corresponding binary operator type */ + typedef op::div OPType; +}; +} // namespace sv +/*! \brief namespace for potential reducer operations */ +namespace red { +namespace limits { +/*! + * \brief minimum value of certain types + * \tparam DType data type + */ +template +MSHADOW_XINLINE DType MinValue(void); +/*! \brief minimum value of float */ +template<> +MSHADOW_XINLINE float MinValue(void) { + return -FLT_MAX; +} +/*! \brief minimum value of double */ +template<> +MSHADOW_XINLINE double MinValue(void) { + return -DBL_MAX; +} +/*! \brief minimum value of int */ +template<> +MSHADOW_XINLINE int MinValue(void) { + return INT_MIN; +} +} // namespace limits + +/*! \brief sum reducer */ +struct sum { + /*! \brief do reduction into dst */ + template + MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src) { + dst += src; + } + /*! + *\brief calculate gradient of redres with respect to redsrc, + * redres: reduced result, redsrc: one of reduction element + */ + template + MSHADOW_XINLINE static DType PartialGrad(DType redres, DType redsrc) { + return 1; + } + /*! + *\brief set the initial value during reduction + */ + template + MSHADOW_XINLINE static void SetInitValue(DType &initv) { + initv = 0; + } +}; +/*! \brief maximum reducer */ +struct maximum { + /*! \brief do reduction into dst */ + template + MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src) { + using namespace std; + dst = max(dst, src); + } + /*! 
+ * \brief calculate gradient of redres with respect to redsrc, + * redres: reduced result, redsrc: one of reduction element + */ + template + MSHADOW_XINLINE static DType PartialGrad(DType redres, DType redsrc) { + return redres == redsrc ? 1: 0; + } + /*! + *\brief set the initial value during reduction + */ + template + MSHADOW_XINLINE static void SetInitValue(DType &initv) { + initv = limits::MinValue(); + } +}; +} // namespace red +} // namespace mshadow +#endif // MSHADOW_BASE_H_ diff --git a/mshadow/cuda/cuda_reduce.cuh b/mshadow/cuda/cuda_reduce.cuh deleted file mode 100644 index b7808a6ffa30..000000000000 --- a/mshadow/cuda/cuda_reduce.cuh +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef MSHADOW_CUDA_REDUCE_CUH -#define MSHADOW_CUDA_REDUCE_CUH -/*! - * \file cuda_reduce.cuh - * \brief helper functions to do reduction - * \author Tianqi Chen - */ -namespace mshadow{ - namespace cuda{ - /* - * \brief reduce over the dimension x - * \tparam Reducer reducer - * \tparam x_bits dimension = 1< - inline __device__ void Reduce1D( volatile real_t buf[1< - inline __device__ void Reduce1DNotAlign( volatile real_t buf[1< - inline __device__ void ReduceX( volatile real_t buf[], int tid ){ - if( x_bits >= 10 ){ - if( tid < 512 ) Reducer::Reduce( buf[tid] , buf[tid + 512] ); - __syncthreads(); - } - if( x_bits >= 9 ){ - if( tid < 256 ) Reducer::Reduce( buf[tid] , buf[tid + 256] ); - __syncthreads(); - } - if( x_bits >= 8 ){ - if( tid < 128 ) Reducer::Reduce( buf[tid] , buf[tid + 128] ); - __syncthreads(); - } - if( x_bits >= 7 ){ - if( tid < 64 ) Reducer::Reduce( buf[tid] , buf[tid + 64 ] ); - __syncthreads(); - } - if( x_bits >= 6 ){ - if( tid < 32 ) Reducer::Reduce( buf[tid] , buf[tid + 32] ); - __syncthreads(); - } - // in warp optimization - if( x_bits >= 5 ){ - if( tid < 16 ) Reducer::Reduce( buf[tid] , buf[tid + 16] ); - __MSHADOW_EMUSYNC__; - } - if( x_bits >= 4 ){ - if( tid < 8 ) Reducer::Reduce( buf[tid] , buf[tid + 8 ] ); - __MSHADOW_EMUSYNC__; - } - if( x_bits >= 3 ){ - if( tid < 4 ) Reducer::Reduce( buf[tid] , buf[tid + 4 ] ); - __MSHADOW_EMUSYNC__; - } - if( x_bits >= 2 ){ - if( tid < 2 ) Reducer::Reduce( buf[tid] , buf[tid + 2 ] ); - __MSHADOW_EMUSYNC__; - } - if( x_bits >= 1 ){ - if( tid < 1 ) Reducer::Reduce( buf[tid] , buf[tid + 1 ] ); - __MSHADOW_EMUSYNC__; - } - }; - - template - inline __device__ void Reduce1D( volatile real_t buf[1<( buf, threadIdx.x ); - } - - // reduce with a upper bound - #define __RD_NON_ALIGN(els,x_bits) \ - els \ - if( xmax_bits >= x_bits && x_size >= (1 << x_bits) ){ \ - if( tid < (1 << x_bits) && tid + (1<( buf, tid ); \ - } \ - - template - inline __device__ void Reduce1DNotAlign( volatile real_t buf[], int x_size ){ - int tid = threadIdx.x; - __RD_NON_ALIGN(, 8) - __RD_NON_ALIGN(else, 7) - __RD_NON_ALIGN(else, 6) - __RD_NON_ALIGN(else, 5) - __RD_NON_ALIGN(else, 4) - __RD_NON_ALIGN(else, 3) - __RD_NON_ALIGN(else, 2) - __RD_NON_ALIGN(else, 1) - } - }; -}; - -#endif // MSHADOW_CUDA_REDUCE_CUH - diff --git a/mshadow/cuda/reduce.cuh b/mshadow/cuda/reduce.cuh new file mode 100644 index 000000000000..8fa0cf1dc061 --- /dev/null +++ b/mshadow/cuda/reduce.cuh @@ -0,0 +1,118 @@ +/*! 
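The operator structs above are the extension point the comments describe: a user-defined struct with a static `Map` can be passed to `F<op>(...)` and reused on both CPU and GPU. A sketch with a hypothetical `sigmoid` op (not part of mshadow):

```
#include <math.h>
#include "mshadow/tensor.h"

// user-defined unary operator in the same style as op::identity;
// MSHADOW_XINLINE lets the very same struct be used inside GPU kernels
struct sigmoid {
  template<typename DType>
  MSHADOW_XINLINE static DType Map(DType a) {
    return DType(1.0f / (1.0f + expf(-a)));
  }
};

void Activate(mshadow::Tensor<mshadow::cpu, 2, float> out,
              mshadow::Tensor<mshadow::cpu, 2, float> in) {
  using namespace mshadow::expr;
  // element-wise map: out[i][j] = sigmoid(in[i][j])
  out = F<sigmoid>(in);
}
```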
+ * Copyright (c) 2014 by Contributors + * \file reduce.cuh + * \brief helper functions to do reduction + * \author Tianqi Chen + */ +#ifndef MSHADOW_CUDA_REDUCE_CUH_ +#define MSHADOW_CUDA_REDUCE_CUH_ + +namespace mshadow { +namespace cuda { +/* + * \brief reduce over the dimension x + * \tparam Reducer reducer + * \tparam x_bits dimension = 1< +inline __device__ void Reduce1D(volatile DType buf[1 << x_bits]); +/* + * \brief reduce over the dimension x + * \tparam Reducer reducer + * \tparam xmax_bits maximum size of buffer + * \tparam DType content data type + * \param xsize size of x dimension, not sure if aligned + */ +template +inline __device__ void +Reduce1DNotAlign(volatile DType buf[1 << xmax_bits], int xsize); +// ===============================================x=== +// implementations afterwards, +// no need to read if only use the functions +// -------------------------------------------------- +#ifdef __DEVICE_EMULATION__ +#define __MSHADOW_EMUSYNC__ __syncthreads() +#else +#define __MSHADOW_EMUSYNC__ +#endif + +template +inline __device__ void ReduceX(volatile DType buf[], int tid) { + if (x_bits >= 10) { + if (tid < 512) Reducer::Reduce(buf[tid] , buf[tid + 512]); + __syncthreads(); + } + if (x_bits >= 9) { + if (tid < 256) Reducer::Reduce(buf[tid] , buf[tid + 256]); + __syncthreads(); + } + if (x_bits >= 8) { + if (tid < 128) Reducer::Reduce(buf[tid] , buf[tid + 128]); + __syncthreads(); + } + if (x_bits >= 7) { + if (tid < 64) Reducer::Reduce(buf[tid] , buf[tid + 64]); + __syncthreads(); + } + if (x_bits >= 6) { + if (tid < 32) Reducer::Reduce(buf[tid] , buf[tid + 32]); + __syncthreads(); + } + // in warp optimization + if (x_bits >= 5) { + if (tid < 16) Reducer::Reduce(buf[tid] , buf[tid + 16]); +#if MSHADOW_OLD_CUDA + __syncthreads(); +#else + __MSHADOW_EMUSYNC__; +#endif + } + if (x_bits >= 4) { + if (tid < 8) Reducer::Reduce(buf[tid] , buf[tid + 8]); + __MSHADOW_EMUSYNC__; + } + if (x_bits >= 3) { + if (tid < 4) Reducer::Reduce(buf[tid] , buf[tid + 4]); + __MSHADOW_EMUSYNC__; + } + if (x_bits >= 2) { + if (tid < 2) Reducer::Reduce(buf[tid] , buf[tid + 2]); + __MSHADOW_EMUSYNC__; + } + if (x_bits >= 1) { + if (tid < 1) Reducer::Reduce(buf[tid] , buf[tid + 1]); + __MSHADOW_EMUSYNC__; + } +} +template +inline __device__ void Reduce1D(volatile DType buf[1 << x_bits]) { + ReduceX(buf, threadIdx.x); +} +// reduce with a upper bound +#define __RD_NON_ALIGN(els, x_bits) \ + els \ + if (xmax_bits >= x_bits && x_size >= (1 << x_bits)) { \ + if (tid < (1 << x_bits) && tid + (1 << x_bits) < x_size) { \ + Reducer::Reduce(buf[tid] , buf[tid + (1 << x_bits)]); \ + } \ + __syncthreads(); \ + ReduceX(buf, tid); \ + } \ + +template +inline __device__ void Reduce1DNotAlign(volatile DType buf[], int x_size) { + int tid = threadIdx.x; + __RD_NON_ALIGN(, 8) + __RD_NON_ALIGN(else, 7) + __RD_NON_ALIGN(else, 6) + __RD_NON_ALIGN(else, 5) + __RD_NON_ALIGN(else, 4) + __RD_NON_ALIGN(else, 3) + __RD_NON_ALIGN(else, 2) + __RD_NON_ALIGN(else, 1) +} +} // namespace cuda +} // namespace mshadow +#endif // MSHADOW_CUDA_REDUCE_CUH_ + diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index 61e477cf531b..a65add5237a7 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -1,231 +1,253 @@ -#ifndef MSHADOW_TENSOR_GPU_INL_CUH -#define MSHADOW_TENSOR_GPU_INL_CUH /*! 
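`Reduce1D` expects each block to be launched with exactly `1 << x_bits` threads and a shared buffer of the same size. An illustrative kernel (not part of mshadow) that sums one row per block using the `sum` reducer:

```
#include "mshadow/tensor.h"  // pulls in mshadow::cuda when compiled by nvcc

// illustrative kernel: each block sums one row of `n` floats into
// out[blockIdx.x]; launch with exactly (1 << x_bits) threads,
// e.g. BlockSumKernel<8><<<num_rows, 256>>>(d_out, d_in, n);
template<int x_bits>
__global__ void BlockSumKernel(float *out, const float *in, int n) {
  __shared__ float buf[1 << x_bits];
  float v = 0.0f;
  // strided load: every thread first accumulates its own share
  for (int i = threadIdx.x; i < n; i += (1 << x_bits)) {
    v += in[blockIdx.x * n + i];
  }
  buf[threadIdx.x] = v;
  __syncthreads();
  // cooperative tree reduction over the shared buffer
  mshadow::cuda::Reduce1D<mshadow::red::sum, x_bits>(buf);
  if (threadIdx.x == 0) out[blockIdx.x] = buf[0];
}
```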
+ * Copyright (c) 2014 by Contributors * \file tensor_gpu-inl.cuh * \brief implementation of GPU code using CUDA * \author Bing Xu, Tianqi Chen */ +#ifndef MSHADOW_CUDA_TENSOR_GPU_INL_CUH_ +#define MSHADOW_CUDA_TENSOR_GPU_INL_CUH_ #include "../tensor.h" -#include "cuda_reduce.cuh" +#include "./reduce.cuh" -namespace mshadow{ - namespace cuda{ - #ifndef __CUDA_ARCH__ - #warning "__CUDA_ARCH__ is not defined, I will assume compiling with CUDA verion greater than 2.0" - #endif - /* load unit for memory access */ - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200 - const int kMemUnitBits = 5; - const int kMaxThreadsPerBlock = 1024; - #else - const int kMemUnitBits = 4; - const int kMaxThreadsPerBlock = 512; - #endif - /*! \brief number of units that can do synchronized update, half warp size */ - const int kMemUnit = 1 << kMemUnitBits; - /*! \brief mask that could be helpful sometime */ - const int kMemUnitMask = kMemUnit - 1; - /*! \brief suggested thread number(logscale) for mapping kernel */ - const int kBaseThreadBits = 8; - /*! \brief suggested thread number for mapping kernel */ - const int kBaseThreadNum = 1 << kBaseThreadBits; - /*! \brief maximum value of grid */ - const int kMaxGridNum = 65535; - /*! \brief suggested grid number for mapping kernel */ - const int kBaseGridNum = 1024; - - /*! \brief get align stride for given size in x dimension */ - inline index_t GetAlignStride( index_t xsize, index_t xstride ){ - if( (xstride & (kMemUnit-1)) == 0 ){ - return ( (xsize + kMemUnit - 1) >> kMemUnitBits) << kMemUnitBits; - }else{ - // if originally space is not aligned, no necessary to to alligned thread allocation - return xsize; - } - } - inline void CheckLaunchParam( dim3 dimGrid, dim3 dimBlock, const char *estr = "" ){ - if( dimBlock.x*dimBlock.y*dimBlock.z > (unsigned)kMaxThreadsPerBlock || - dimGrid.x > 65535 || dimGrid.y > 65535 ){ - fprintf( stderr, "%s[%u,%u,%u]:", estr, dimBlock.x, dimBlock.y, dimBlock.z ); - utils::Error( "too large launch parameter\n"); - } - } - }; +namespace mshadow { +namespace cuda { +/* load unit for memory access, if CUDAARCH not defined, this is advanced nvcc */ +#if MSHADOW_OLD_CUDA +const int kMemUnitBits = 4; +const int kMaxThreadsPerBlock = 512; +#else +const int kMemUnitBits = 5; +const int kMaxThreadsPerBlock = 1024; +#endif +/*! \brief number of units that can do synchronized update, half warp size */ +const int kMemUnit = 1 << kMemUnitBits; +/*! \brief mask that could be helpful sometime */ +const int kMemUnitMask = kMemUnit - 1; +/*! \brief suggested thread number(logscale) for mapping kernel */ +const int kBaseThreadBits = 8; +/*! \brief suggested thread number for mapping kernel */ +const int kBaseThreadNum = 1 << kBaseThreadBits; +/*! \brief maximum value of grid */ +const int kMaxGridNum = 65535; +/*! \brief suggested grid number for mapping kernel */ +const int kBaseGridNum = 1024; +/*! 
\brief get align stride for given size in x dimension */ +inline index_t GetAlignStride(index_t xsize) { + if (xsize >= MSHADOW_MIN_PAD_RATIO * 32) { + return ((xsize + kMemUnit - 1) >> kMemUnitBits) << kMemUnitBits; + } else { + // if originally space is not aligned, no necessary to to alligned thread allocation + return xsize; + } +} +inline void CheckLaunchParam(dim3 dimGrid, dim3 dimBlock, const char *estr = "") { + if (dimBlock.x * dimBlock.y * dimBlock.z > static_cast(kMaxThreadsPerBlock) || + dimGrid.x > 65535 || dimGrid.y > 65535) { + fprintf(stderr, "%s[%u,%u,%u]:", estr, dimBlock.x, dimBlock.y, dimBlock.z); + utils::Error("too large launch parameter\n"); + } +} +template +__device__ void MapPlanProc(DstPlan dst, index_t xstride, + Shape<2> dshape, const Plan exp, int block_idx) { + const index_t tid = (block_idx << block_dim_bits) + threadIdx.x; + const int y = tid / xstride; + const int x = tid % xstride; + if (y < dshape[0] && x < dshape[1]) { + Saver::Save(dst.REval(y, x), exp.Eval(y,x)); + } +} +template +__global__ void MapPlanKernel(DstPlan dst, index_t xstride, + Shape<2> dshape, const Plan exp) { + MapPlanProc + (dst, xstride, dshape, exp, blockIdx.x); +} +template +__global__ void MapPlanLargeKernel(DstPlan dst, index_t xstride, + Shape<2> dshape, const Plan exp, int repeat) { + for (int i = 0; i < repeat; ++i) { + MapPlanProc + (dst, xstride, dshape, exp, blockIdx.x + i * grid_size); + } +} - namespace cuda { - template - __device__ void MapPlanProc( Tensor dst, const index_t xstride, const Plan exp, int block_idx ){ - const index_t tid = (block_idx << block_dim_bits) + threadIdx.x; - const int y = tid / xstride; - const int x = tid % xstride; - if (y < dst.shape[1] && x < dst.shape[0]) { - Saver::Save(dst[y][x], exp.Eval(y,x)); - } - } - template - __global__ void MapPlanKernel( Tensor dst, const index_t xstride, const Plan exp ){ - MapPlanProc( dst, xstride, exp, blockIdx.x ); - } - template - __global__ void MapPlanLargeKernel( Tensor dst, const index_t xstride, const Plan exp, int repeat ){ - for( int i = 0; i < repeat; ++i ){ - MapPlanProc( dst, xstride, exp, blockIdx.x + i*grid_size ); - } - } - - template - inline void MapPlan( Tensor dst, const expr::Plan &plan ){ - const index_t xstride = GetAlignStride( dst.shape[0], dst.shape.stride_ ); - const int num_block = ( dst.shape[1]*xstride + kBaseThreadNum-1) / kBaseThreadNum; - dim3 dimBlock(kBaseThreadNum, 1, 1); +template +inline void MapPlan(expr::Plan dst, + const expr::Plan &plan, + Shape<2> dshape, + cudaStream_t stream) { + const index_t xstride = GetAlignStride(dshape[1]); + const int num_block = (dshape[0] * xstride + kBaseThreadNum-1) / kBaseThreadNum; + dim3 dimBlock(kBaseThreadNum, 1, 1); + + if (num_block < kMaxGridNum) { + dim3 dimGrid(num_block, 1, 1); + MapPlanKernel, + expr::Plan > + <<>>(dst, xstride, dshape, plan); + } else { + int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum; + dim3 dimGrid(kBaseGridNum, 1 , 1); + MapPlanLargeKernel, + expr::Plan > + <<>>(dst, xstride, dshape, plan, repeat); + } +} - if (num_block < kMaxGridNum) { - dim3 dimGrid(num_block, 1, 1); - MapPlanKernel, kBaseThreadBits> \ - <<>>(dst, xstride, plan); - } else { - int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum; - dim3 dimGrid( kBaseGridNum, 1 , 1 ); - MapPlanLargeKernel, kBaseThreadBits, kBaseGridNum> \ - <<>>(dst, xstride, plan, repeat ); - } - } - }; // namespace cuda - - namespace cuda{ - template - __global__ void MapRedKeepLowestKernel( Tensor dst, Plan plan, real_t scale, Shape<2> eshape ){ - 
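`MapPlanKernel` assigns one thread per output element; each thread evaluates the right-hand-side plan at its own coordinate and stores through the saver. A CPU reference loop showing the same contract (a sketch, not mshadow code):

```
#include "mshadow/tensor.h"

// CPU reference of what one MapPlan launch computes:
// Saver is a saver from namespace sv (sv::saveto for '=', sv::plusto for '+='),
// dst is the destination plan and plan evaluates the right-hand side.
template<typename Saver, typename DstPlan, typename SrcPlan>
inline void MapPlanReference(DstPlan dst, mshadow::Shape<2> dshape,
                             const SrcPlan &plan) {
  for (mshadow::index_t y = 0; y < dshape[0]; ++y) {
    for (mshadow::index_t x = 0; x < dshape[1]; ++x) {
      // same contract as MapPlanProc: one saver call per output element
      Saver::Save(dst.REval(y, x), plan.Eval(y, x));
    }
  }
}
```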
const unsigned warp_size = 1 << warp_bits; - const unsigned x = (blockIdx.x< +__global__ void MapRedKeepLowestKernel(DstPlan dst, Plan plan, + DType scale, Shape<2> eshape) { + const unsigned warp_size = 1 << warp_bits; + const unsigned x = (blockIdx.x << warp_bits) + threadIdx.x; + // to avoid bank conflict + __shared__ DType s_res[warp_size][warp_size + 1]; + // note: reverse store [y][x], so that we can reduce over threadIdx.x, use warp optimization + if (threadIdx.y < eshape[0] && x < eshape[1]) { + s_res[threadIdx.x][threadIdx.y] = plan.Eval(threadIdx.y, x); + } + for (unsigned y = warp_size; y < eshape[0]; y += warp_size) { + if (threadIdx.y + y < eshape[0] && x < eshape[1]) { + Reducer::Reduce(s_res[threadIdx.x][threadIdx.y], plan.Eval(threadIdx.y + y, x)); + } + } + __syncthreads(); + if (eshape[0] >= warp_size) { + Reduce1D(s_res[threadIdx.y]); + } else { + Reduce1DNotAlign(s_res[threadIdx.y], eshape[0]); + } + __syncthreads(); - // note: reverse store [y][x], so that we can reduce over threadIdx.x, use warp optimization - if( threadIdx.y < eshape[1] && x < eshape[0] ){ - s_res[ threadIdx.x ][ threadIdx.y ] = plan.Eval( threadIdx.y, x ); - } - for( unsigned y = warp_size; y < eshape[1]; y += warp_size ){ - if( threadIdx.y + y < eshape[1] && x < eshape[0] ){ - Reducer::Reduce( s_res[ threadIdx.x ][ threadIdx.y ], plan.Eval( threadIdx.y + y, x ) ); - } - } - __syncthreads(); - if( eshape[1] >= warp_size ){ - Reduce1D( s_res[ threadIdx.y ] ); - }else{ - Reduce1DNotAlign( s_res[ threadIdx.y ], eshape[1] ); - } - __syncthreads(); - - if( threadIdx.y == 0 && x < eshape[0] ){ - Saver::Save( dst[x], s_res[ threadIdx.x ][ 0 ] * scale ); - } - } - - template - inline void MapReduceKeepLowest( Tensor dst, const expr::Plan &plan, real_t scale, Shape<2> eshape ){ - dim3 dimBlock( kMemUnit, kMemUnit ); - dim3 dimGrid ( (eshape[0]+kMemUnit-1) >> kMemUnitBits ); - CheckLaunchParam( dimGrid, dimBlock, "MapRedKeepLowestKernel" ); - MapRedKeepLowestKernel<<>>( dst, plan, scale, eshape ); - } - }; // namespace cuda - - namespace cuda{ - template - __global__ void MapReduceKeepDim2Kernel( Tensor dst, Plan plan, real_t scale, Shape<4> pshape ){ - const int block_size = 1 << block_dim_bits; - __shared__ real_t s_rec[ block_size ]; - const int c = blockIdx.x; - const index_t tot = pshape[0]*pshape[1]*pshape[3]; + if (threadIdx.y == 0 && x < eshape[1]) { + Saver::Save(dst.REval(0, x), s_res[threadIdx.x][0] * scale); + } +} - real_t res = Reducer::kInitV; - for( index_t i_offset = 0; i_offset < tot; i_offset += block_size ){ - index_t i = i_offset + threadIdx.x; - if( i< tot ){ - const index_t x = i % pshape[0]; - i /= pshape[0]; - const index_t y = i % pshape[1]; - const index_t n = i / pshape[1]; - Reducer::Reduce( res, plan.Eval( (n*pshape[2] + c) * pshape[1] + y, x ) ); - } - } - s_rec[ threadIdx.x ] = res; - __syncthreads(); - Reduce1D( s_rec ); - if( threadIdx.x == 0 ){ - Saver::Save( dst[c], s_rec[0]*scale ); - } - } +template +inline void MapReduceKeepLowest(expr::Plan dst, + const expr::Plan &plan, + DType scale, Shape<2> eshape, + cudaStream_t stream) { + dim3 dimBlock(kMemUnit, kMemUnit); + dim3 dimGrid((eshape[1] + kMemUnit - 1) >> kMemUnitBits); + CheckLaunchParam(dimGrid, dimBlock, "MapRedKeepLowestKernel"); + MapRedKeepLowestKernel, + expr::Plan > + <<>>(dst, plan, scale, eshape); +} - template - inline void MapReduceKeepDim2( Tensor dst, const Plan &plan, real_t scale, Shape<4> pshape ){ - dim3 dimBlock( kBaseThreadNum ); - dim3 dimGrid ( dst.shape[0] ); - CheckLaunchParam( dimGrid, 
dimBlock, "MapReduceKeepDim2" ); - MapReduceKeepDim2Kernel - <<>>( dst, plan, scale, pshape ); - } - }; - - namespace cuda{ - template - __global__ void SoftmaxKernel( Tensor dst, Tensor src ){ - const unsigned x_size = 1 << x_bits; - const int y = blockIdx.x; - __shared__ real_t s_rec[ x_size ]; - - // step 1: get max - if( threadIdx.x < dst.shape[ 0 ] ){ - s_rec[ threadIdx.x ] = src[ y ][ threadIdx.x ] ; - } - for( unsigned x = x_size; x < dst.shape[0]; x += x_size ){ - if( x + threadIdx.x < dst.shape[0] ){ - real_t a = src[ y ][ x + threadIdx.x ]; - s_rec[ threadIdx.x ] = max( a, s_rec[ threadIdx.x] ); - } - } - __syncthreads(); - if( threadIdx.x >= dst.shape[0] ){ - s_rec[ threadIdx.x ] = s_rec[0]; - } - __syncthreads(); - Reduce1D( s_rec ); - __syncthreads(); - real_t smax = s_rec[0]; - __syncthreads(); - s_rec[ threadIdx.x ] = 0.0f; - __syncthreads(); +template +__global__ void MapReduceKeepDim1Kernel(DstPlan dst, Plan plan, DType scale, Shape<4> pshape) { + const int block_size = 1 << block_dim_bits; + __shared__ DType s_rec[block_size]; + const int c = blockIdx.x; + const index_t tot = pshape[3] * pshape[2] * pshape[0]; + + DType res; Reducer::SetInitValue(res); + for (index_t i_offset = 0; i_offset < tot; i_offset += block_size) { + index_t i = i_offset + threadIdx.x; + if (i< tot) { + const index_t x = i % pshape[3]; + i /= pshape[3]; + const index_t y = i % pshape[2]; + const index_t n = i / pshape[2]; + Reducer::Reduce(res, plan.Eval((n * pshape[1] + c) * pshape[2] + y, x)); + } + } + s_rec[threadIdx.x] = res; + __syncthreads(); + Reduce1D(s_rec); + if (threadIdx.x == 0) { + Saver::Save(dst.REval(0, c), s_rec[0] * scale); + } +} - // calculate normalizer, with writeback - for( unsigned x = 0; x < dst.shape[0]; x += x_size ){ - if( x + threadIdx.x < dst.shape[0] ){ - real_t p = expf( src[ y ][ x + threadIdx.x ] - smax ); - s_rec[ threadIdx.x ] += p; - // write back first, will fetch later - dst[ y ][ x + threadIdx.x ] = p; - } - } - // calculate normalizer - __syncthreads(); - Reduce1D( s_rec ); - __syncthreads(); - real_t ssum = s_rec[0]; +template +inline void MapReduceKeepDim1(expr::Plan dst, + const expr::Plan &plan, + DType scale, Shape<4> pshape, + cudaStream_t stream) { + dim3 dimBlock(kBaseThreadNum); + dim3 dimGrid (pshape[1]); + CheckLaunchParam(dimGrid, dimBlock, "MapReduceKeepDim1"); + MapReduceKeepDim1Kernel, + expr::Plan > + <<>>(dst, plan, scale, pshape); +} - for( unsigned x = 0; x < dst.shape[0]; x += x_size ){ - if( x + threadIdx.x < dst.shape[0] ){ - dst[ y ][ x + threadIdx.x ] /= ssum; - } - } - } - - inline void Softmax( Tensor &dst, const Tensor &src ){ - dim3 dimBlock( kBaseThreadNum ); - dim3 dimGrid ( dst.shape[1] ); - utils::Assert( dst.shape == src.shape, "Softmax: shape mismatch" ); - CheckLaunchParam( dimGrid, dimBlock, "Softmax" ); - SoftmaxKernel<<>>( dst, src ); - } - }; // namespace cuda -}; // namespace mshadow -#endif // TENSOR_GPU_INL_H +template +__global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) { + const unsigned x_size = 1 << x_bits; + const int y = blockIdx.x; + __shared__ DType s_rec[x_size]; + // step 1: get max + if (threadIdx.x < xmax) { + s_rec[threadIdx.x] = src.Eval(y, threadIdx.x); + } + for (unsigned x = x_size; x < xmax; x += x_size) { + if (x + threadIdx.x < xmax) { + DType a = src.Eval(y, x + threadIdx.x); + s_rec[threadIdx.x] = max(a, s_rec[threadIdx.x]); + } + } + __syncthreads(); + if (threadIdx.x >= xmax) { + s_rec[threadIdx.x] = s_rec[0]; + } + __syncthreads(); + Reduce1D(s_rec); + __syncthreads(); + 
DType smax = s_rec[0]; + __syncthreads(); + s_rec[threadIdx.x] = 0.0f; + __syncthreads(); + + // calculate normalizer, with writeback + for (unsigned x = 0; x < xmax; x += x_size) { + if (x + threadIdx.x < xmax) { + DType p = expf(src.Eval(y, x + threadIdx.x) - smax); + s_rec[threadIdx.x] += p; + // write back first, will fetch later + dst.REval(y, x + threadIdx.x) = p; + } + } + // calculate normalizer + __syncthreads(); + Reduce1D(s_rec); + __syncthreads(); + DType ssum = s_rec[0]; + + for (unsigned x = 0; x < xmax; x += x_size) { + if (x + threadIdx.x < xmax) { + dst.REval(y, x + threadIdx.x) /= ssum; + } + } +} +template +inline void Softmax(Tensor &dst, + const Tensor &src) { + dim3 dimBlock(kBaseThreadNum); + dim3 dimGrid(dst.size(0)); + utils::Check(dst.shape_ == src.shape_, "Softmax: shape mismatch"); + CheckLaunchParam(dimGrid, dimBlock, "Softmax"); + cudaStream_t stream = Stream::GetStream(dst.stream_); + SoftmaxKernel + <<>> + (expr::MakePlan(dst), + expr::MakePlan(src), + dst.size(1)); +} +} // namespace cuda +} // namespace mshadow +#endif // MSHADOW_CUDA_TENSOR_GPU_INL_CUH_ diff --git a/mshadow/dot_engine-inl.h b/mshadow/dot_engine-inl.h new file mode 100644 index 000000000000..168441657baa --- /dev/null +++ b/mshadow/dot_engine-inl.h @@ -0,0 +1,200 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file dot_engine-inl.h + * \brief definitions of how Matrix Multiplications can be evaluated + * \author Tianqi Chen + */ +#ifndef MSHADOW_DOT_ENGINE_INL_H_ +#define MSHADOW_DOT_ENGINE_INL_H_ +namespace mshadow { +namespace expr { +//--------------------------------------------------------------------- +// Matrix Multiplications, depends on BLAS Engine +//--------------------------------------------------------------------- +template +struct DotEngine { + inline static void Eval(Tensor *p_dst, + const Tensor &lhs, + const Tensor &rhs, + DType scale); +}; +// handles the dot +template +struct BLASEngine; +#if (MSHADOW_USE_CBLAS || MSHADOW_USE_MKL) +template<> +struct BLASEngine { + inline static CBLAS_TRANSPOSE GetT(bool t) { + return t ? 
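For reference, a plain CPU sketch of the per-row computation `SoftmaxKernel` performs: subtract the row maximum for numerical stability, exponentiate, then normalize. The function below is illustrative only:

```
#include <cmath>
#include <algorithm>
#include "mshadow/tensor.h"

// illustrative CPU reference of the row-wise softmax computed by SoftmaxKernel
inline void SoftmaxReference(mshadow::Tensor<mshadow::cpu, 2, float> dst,
                             mshadow::Tensor<mshadow::cpu, 2, float> src) {
  for (mshadow::index_t y = 0; y < src.size(0); ++y) {
    // step 1: row maximum, for numerical stability
    float smax = src[y][0];
    for (mshadow::index_t x = 1; x < src.size(1); ++x) {
      smax = std::max(smax, src[y][x]);
    }
    // step 2: exponentiate and accumulate the normalizer
    float ssum = 0.0f;
    for (mshadow::index_t x = 0; x < src.size(1); ++x) {
      dst[y][x] = std::exp(src[y][x] - smax);
      ssum += dst[y][x];
    }
    // step 3: normalize
    for (mshadow::index_t x = 0; x < src.size(1); ++x) {
      dst[y][x] /= ssum;
    }
  }
}
```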
CblasTrans : CblasNoTrans; + } + inline static void SetStream(Stream *stream) { + } + inline static void gemm(bool transa, bool transb, + int m, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, + float beta, float *C, int ldc) { + cblas_sgemm(CblasColMajor, GetT(transa), GetT(transb), + m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + } + inline static void gemm(bool transa, bool transb, + int m, int n, int k, double alpha, + const double *A, int lda, const double *B, int ldb, + double beta, double *C, int ldc) { + cblas_dgemm(CblasColMajor, GetT(transa), GetT(transb), + m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + } + inline static void gemv(bool trans, int m, int n, + float alpha, const float *A, int lda, + const float *X, int incX, + float beta, float *Y, int incY) { + cblas_sgemv(CblasColMajor, GetT(trans), m, n, alpha, + A, lda, X, incX, beta, Y, incY); + } + inline static void gemv(bool trans, int m, int n, double alpha, + const double *A, int lda, + const double *X, int incX, + double beta, double *Y, int incY) { + cblas_dgemv(CblasColMajor, GetT(trans), m, n, alpha, + A, lda, X, incX, beta, Y, incY); + } + inline static void ger(int m, int n, float alpha, + const float *X, int incX, + const float *Y, int incY, float *A, int lda) { + cblas_sger(CblasColMajor, m, n, alpha, X, incX, Y, incY, A, lda); + } + inline static void ger(int m, int n, double alpha, + const double *X, int incX, + const double *Y, int incY, double *A, int lda) { + cblas_dger(CblasColMajor, m, n, alpha, X, incX, Y, incY, A, lda); + } +}; +#endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL +// CuBLAS redirect code +#if MSHADOW_USE_CUDA +// All CuBLAS goes to here, use legacy API: not threadsafe +template<> +struct BLASEngine { + inline static char GetT(bool t) { + return t ? 'T' : 'N'; + } + inline static void SetStream(Stream *stream) { + cublasSetKernelStream(Stream::GetStream(stream)); + } + inline static void gemm(bool transa, bool transb, + int m, int n, int k, float alpha, + const float *A, int lda, + const float *B, int ldb, float beta, + float *C, int ldc) { + cublasSgemm(GetT(transa), GetT(transb), m, n, k, alpha, + A, lda, B, ldb, beta, C, ldc); + } + inline static void gemm(bool transa, bool transb, + int m, int n, int k, double alpha, + const double *A, int lda, + const double *B, int ldb, + double beta, double *C, int ldc) { + cublasDgemm(GetT(transa), GetT(transb), m, n, k, alpha, + A, lda, B, ldb, beta, C, ldc); + } + inline static void gemv(bool trans, int m, int n, float alpha, + const float *A, int lda, + const float *X, int incX, float beta, + float *Y, int incY) { + cublasSgemv(GetT(trans), m, n, alpha, A, lda, X, incX, beta, Y, incY); + } + inline static void gemv(bool trans, int m, int n, double alpha, + const double *A, int lda, + const double *X, int incX, + double beta, double *Y, int incY) { + cublasDgemv(GetT(trans), m, n, alpha, A, lda, X, incX, beta, Y, incY); + } + inline static void ger(int m, int n, float alpha, + const float *X, int incX, + const float *Y, int incY, float *A, int lda) { + cublasSger(m, n, alpha, X, incX, Y, incY, A, lda); + } + inline static void ger(int m, int n, double alpha, + const double *X, int incX, + const double *Y, int incY, double *A, int lda) { + cublasDger(m, n, alpha, X, incX, Y, incY, A, lda); + } +}; +#endif // MSHADOW_USE_CUDA +// helper function to decide which shape we are in +inline static Shape<2> GetShape(const Shape<2> &shape, bool transpose) { + return transpose ? 
Shape2(shape[1], shape[0]) : shape; +} +// dst = dot(lhs[.T], rhs[.T]) +template +struct DotEngine { + inline static void Eval(Tensor *p_dst, + const Tensor &lhs, + const Tensor &rhs, + DType scale) { + Tensor &dst = *p_dst; + // set kernel stream + BLASEngine::SetStream(dst.stream_); + Shape<2> sleft = GetShape(lhs.shape_, transpose_left); + Shape<2> sright = GetShape(rhs.shape_, transpose_right); + utils::Check(dst.size(0) == sleft[0] && dst.size(1) == sright[1] \ + && sleft[1] == sright[0] , + "dot-gemm: matrix shape mismatch"); + // use column major argument to compatible with most BLAS + BLASEngine::gemm + (transpose_right , transpose_left, + transpose_right ? rhs.size(0) : rhs.size(1), + transpose_left ? lhs.size(1) : lhs.size(0), + transpose_right ? rhs.size(1) : rhs.size(0), + scale * SV::AlphaBLAS(), + rhs.dptr_, rhs.stride_, + lhs.dptr_, lhs.stride_, + SV::BetaBLAS(), + dst.dptr_, dst.stride_); + } +}; +template +struct DotEngine { + inline static void Eval(Tensor *p_dst, + const Tensor &lhs, + const Tensor &rhs, + DType scale) { + Tensor &dst = *p_dst; + // set kernel stream + BLASEngine::SetStream(dst.stream_); + Shape<2> sright = GetShape(rhs.shape, transpose_right); + utils::Check(dst.size(0) == sright[1] && lhs.size(0) == sright[0], + "dot-gemv: matrix shape mismatch"); + BLASEngine::gemv + (transpose_right, + rhs.size(1), rhs.size(0), scale * SV::AlphaBLAS(), + rhs.dptr_, rhs.stride_, + lhs.dptr_, 1, SV::BetaBLAS(), + dst.dptr_, 1); + } +}; +template +struct DotEngine { + inline static void Eval(Tensor *p_dst, + const Tensor &lhs, + const Tensor &rhs, + DType scale) { + Tensor &dst = *p_dst; + // set kernel stream + BLASEngine::SetStream(dst.stream_); + utils::Check(dst.size(0) == lhs.size(0) && dst.size(1) == rhs.size(0), + "dot-ger: matrix shape mismatch"); + if (SV::kBetaBLAS == 0.0f) { + BLASEngine::ger + (rhs.size(0), lhs.size(0), scale * SV::AlphaBLAS(), + rhs.dptr_, 1, lhs.dptr_, 1, dst.dptr_, dst.stride_); + } else { + DotEngine::Eval(dst, lhs.FlatTo2D(), rhs.FlatTo2D(), scale); + } + } +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_DOT_ENGINE_INL_H_ diff --git a/mshadow/expr_engine-inl.h b/mshadow/expr_engine-inl.h new file mode 100644 index 000000000000..b6ed59048a82 --- /dev/null +++ b/mshadow/expr_engine-inl.h @@ -0,0 +1,423 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file expr_engine-inl.h + * \brief definitions of how expressions should be evaluated + * \author Tianqi Chen, Bing Xu + */ +#ifndef MSHADOW_EXPR_ENGINE_INL_H_ +#define MSHADOW_EXPR_ENGINE_INL_H_ +#include +#include +#include "./utils.h" +#include "./expression.h" +#include "./tensor.h" + +namespace mshadow { +namespace expr { +/*! + * \brief a general class that allows extension that makes tensors of some shape + * \tparam SubType type of subclass + * \tparam SrcExp source expression of the MakeTensorExp, the source of operation + * \tparam dim dimension of the expression + * \tparam DType the type of elements + */ +template +struct MakeTensorExp + : public Exp, + DType, type::kChainer> { + /*! \brief the shape of this expression */ + Shape shape_; + /*! \brief true self of subtype */ + inline const SubType& real_self(void) const{ + return *static_cast(this); + } +}; +//---------------------------------------------------------------------- +// This part of code gives plan that can be used to carry out execution +//--------------------------------------------------------------------- +// Declarations of plans +template +class Plan { + public: + /*! 
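A usage sketch of the dot expressions that end up in `DotEngine`; the shapes are assumed as commented and mshadow must be built with a BLAS back-end:

```
#include "mshadow/tensor.h"

// assumed shapes: dst is (n, m), data is (n, k), weight is (m, k)
void FullyConnectedForward(mshadow::Tensor<mshadow::cpu, 2, float> dst,
                           mshadow::Tensor<mshadow::cpu, 2, float> data,
                           mshadow::Tensor<mshadow::cpu, 2, float> weight) {
  using namespace mshadow::expr;
  // overwrite: dispatched through DotEngine<sv::saveto, ...> to gemm
  dst = dot(data, weight.T());
  // accumulate: sv::plusto sets the BLAS beta argument to 1
  dst += dot(data, weight.T());
}
```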
+ * \brief evaluate the expression at index [y][x] + * to be implemented by SubType, for RValue, the return type will be DType & + */ + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const; +}; +// tensor plan +template +class Plan, DType> { + public: + explicit Plan(const Tensor &t) + : dptr_(t.dptr_), stride_(t.stride_) {} + // for RValue, the return type should be reference + MSHADOW_XINLINE DType &REval(index_t y, index_t x) { + return dptr_[y * stride_ + x]; + } + // const evaluation + MSHADOW_XINLINE const DType &Eval(index_t y, index_t x) const { + return dptr_[y * stride_ + x]; + } + + private: + DType *dptr_; + index_t stride_; +}; +// special evaluation case for 1d tensor, no stride +template +class Plan, DType> { + public: + explicit Plan(const Tensor &t) : dptr_(t.dptr_) {} + MSHADOW_XINLINE DType &REval(index_t y, index_t x) { + return dptr_[x]; + } + MSHADOW_XINLINE const DType &Eval(index_t y, index_t x) const { + return dptr_[x]; + } + + private: + DType *dptr_; +}; +// scalar +template +class Plan, DType> { + public: + explicit Plan(DType scalar) : scalar_(scalar) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return scalar_; + } + + private: + DType scalar_; +}; +// unary expression +template +class Plan, DstDType> { + public: + explicit Plan(const Plan &src) : src_(src) {} + MSHADOW_XINLINE DstDType Eval(index_t y, index_t x) const { + return static_cast(src_.Eval(y, x)); + } + + private: + Plan src_; +}; +// binary expression +template +class Plan, DType> { + public: + explicit Plan(const Plan &lhs, const Plan &rhs) + : lhs_(lhs), rhs_(rhs) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return OP::Map(lhs_.Eval(y, x), rhs_.Eval(y, x)); + } + + private: + Plan lhs_; + Plan rhs_; +}; +// unary expression +template +class Plan, DType> { + public: + explicit Plan(const Plan &src) : src_(src) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return OP::Map(src_.Eval(y, x)); + } + + private: + Plan src_; +}; +// remaps map tensor expression to subtype's plan +template +struct Plan, DType> { + public: + Plan(const Plan &src) : src_(src) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return src_.Eval(y, x); + } + + private: + Plan src_; +}; +// tranpsoe +template +class Plan, DType> { + public: + explicit Plan(const Plan &src) : src_(src) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return src_.Eval(x, y); + } + + private: + Plan src_; +}; +//---------------------------------------------------------------------- +// Mappings from expression to plans +//--------------------------------------------------------------------- +template +inline Plan, DType> +MakePlan(const BinaryMapExp &e); + +template +inline Plan, DType> MakePlan(const ScalarExp &e) { + return Plan, DType>(e.scalar_); +} + +template +inline Plan, DstDType> +MakePlan(const TypecastExp &e) { + return Plan, DstDType>(MakePlan(e.exp)); +} + +template +inline Plan MakePlan(const RValueExp &e) { + return Plan(e.self()); +} + +template +inline Plan, DType> +MakePlan(const TransposeExp &e) { + return Plan, DType>(MakePlan(e.exp)); +} + +template +inline Plan +MakePlan(const MakeTensorExp &e) { + return Plan(e.real_self()); +} + +template +inline Plan, DType> +MakePlan(const UnaryMapExp &e) { + return Plan, DType>(MakePlan(e.src_)); +} + +template +inline Plan, DType> +MakePlan(const BinaryMapExp &e) { + return Plan, + DType>(MakePlan(e.lhs_), MakePlan(e.rhs_)); +} +//---------------------------------------------------------------- +// 
Static Type inference and Type Checking +//---------------------------------------------------------------- +/*! + * \brief static type inference template, + * used to get the dimension of each expression, + * if ExpInfo::kDim == -1, this means here are mismatch in expression + * if (ExpInfo::kDevMask & cpu::kDevMask) != 0, this means this expression can be assigned to cpu + * \tparam E expression + */ +template +struct ExpInfo { + static const int kDim = -1; + static const int kDevMask = 0; +}; +template +struct ExpInfo< ScalarExp > { + static const int kDim = 0; + static const int kDevMask = 0xffff; +}; +template +struct ExpInfo > { + static const int kDim = ExpInfo::kDim; + static const int kDevMask = ExpInfo::kDevMask; +}; +template +struct ExpInfo > { + static const int kDim = ExpInfo::kDim; + static const int kDevMask = ExpInfo::kDevMask; +}; +template +struct ExpInfo > { + static const int kDim = dim; + static const int kDevMask = Device::kDevMask; +}; +template +struct ExpInfo > { + static const int kDimSrc = ExpInfo::kDim; + static const int kDim = kDimSrc >= 0 ? dim : -1; + static const int kDevMask = ExpInfo::kDevMask; +}; +template +struct ExpInfo > { + static const int kDim = ExpInfo::kDim; + static const int kDevMask = ExpInfo::kDevMask; +}; +template +struct ExpInfo > { + static const int kDimLhs = ExpInfo::kDim; + static const int kDimRhs = ExpInfo::kDim; + static const int kDim = (kDimLhs >= 0 && kDimRhs >= 0) ?\ + (kDimLhs == 0 ?\ + kDimRhs :\ + ((kDimRhs == 0 || kDimLhs == kDimRhs) ? kDimLhs : -1)) : -1; + static const int kDevMask = ExpInfo::kDevMask & ExpInfo::kDevMask; +}; +/*! \brief template to do type check */ +template +struct TypeCheck { + /*! \brief dimension of expression*/ + static const int kExpDim = ExpInfo::kDim; + /*! \brief whether the expression device type matches */ + static const bool kDevPass = (ExpInfo::kDevMask & Device::kDevMask) != 0; + /*! \brief whether the expression can be mapped to expression of dim */ + static const bool kMapPass = (kExpDim == 0 || kExpDim == dim) && kDevPass; + /*! \brief whether the expression can be reduced to expression of dim */ + static const bool kRedPass = (kExpDim > dim) && kDevPass; +}; +/*! \brief used to help static type check*/ +template +struct TypeCheckPass; +// Todo : add static assert using C++11 +template<> +struct TypeCheckPass {}; +template<> +struct TypeCheckPass { + inline static void Error_All_Tensor_in_Exp_Must_Have_Same_Type(void) {} + inline static void Error_TypeCheck_Not_Pass_For_Reduce_Exp(void) {} + inline static void Error_Expression_Does_Not_Meet_Dimension_Req(void) {} +}; + +//---------------------------------------------------------------- +// Runtime Stream Getting +//---------------------------------------------------------------- +template +struct StreamInfo { + inline static Stream *Get(const E &t); +}; +template +struct StreamInfo > { + inline static Stream *Get(const Tensor &t) { + return t.stream_; + } +}; +//---------------------------------------------------------------- +// Runtime Shape Checking +//---------------------------------------------------------------- +/*! 
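`TypeCheckPass` acts as a pre-C++11 static assert: evaluation routines instantiate it so that an expression with a mismatched device or dimension fails to compile with a readable function name. A sketch of the idiom; `MyEval` is hypothetical and the template-parameter order of `TypeCheck` is an assumption based on this header:

```
#include "mshadow/tensor.h"

// hypothetical evaluator: reject anything that cannot be mapped onto a
// 2-D CPU tensor at compile time (TypeCheck parameters assumed to be
// <Device, dim, DType, E>)
template<typename E, typename DType, int etype>
inline void MyEval(mshadow::Tensor<mshadow::cpu, 2, DType> *dst,
                   const mshadow::expr::Exp<E, DType, etype> &exp) {
  using namespace mshadow::expr;
  TypeCheckPass<TypeCheck<mshadow::cpu, 2, DType, E>::kMapPass>
      ::Error_All_Tensor_in_Exp_Must_Have_Same_Type();
  // ... the actual element-wise evaluation would go here ...
}
```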
+ * \brief runtime shape checking template + * get the shape of an expression, report error if shape mismatch + * \tparam dim the dimension of the shape + * \tparam E expression + */ +template +struct ShapeCheck { + inline static Shape Check(const E &t); +}; +template +struct ShapeCheck > { + inline static Shape Check(const ScalarExp &exp) { + // use lowest dimension to mark scalar exp + Shape shape; shape[0] = 0; + return shape; + } +}; +template +struct ShapeCheck > { + inline static Shape + Check(const TypecastExp &exp) { + return ShapeCheck::Check(exp.exp); + } +}; +template +struct ShapeCheck > { + inline static Shape Check(const TransposeExp &e) { + // swap the lowest two dimensions + Shape s = ShapeCheck::Check(e.exp); + std::swap(s[0], s[1]); + return s; + } +}; +template +struct ShapeCheck > { + inline static Shape Check(const Tensor &t) { + return t.shape_; + } +}; +template +struct ShapeCheck > { + inline static Shape + Check(const MakeTensorExp &t) { + return t.shape_; + } +}; +template +struct ShapeCheck > { + inline static Shape Check(const UnaryMapExp &t) { + Shape s = ShapeCheck::Check(t.src_); + return s; + } +}; +template +struct ShapeCheck > { + inline static Shape + Check(const BinaryMapExp &t) { + Shape shape1 = ShapeCheck::Check(t.lhs_); + Shape shape2 = ShapeCheck::Check(t.rhs_); + if (shape1[0] == 0) return shape2; + if (shape2[0] == 0) return shape1; + utils::Check(shape1 == shape2, + "BinaryMapExp: Shapes of operands are not the same"); + return shape1; + } +}; +} // namespace expr +} // namespace mshadow +// include definition of dot engine +#include "./dot_engine-inl.h" + +namespace mshadow { +namespace expr { +/*! \brief some engine that evaluate complex expression */ +template +struct ExpComplexEngine { + inline static void Eval(RV *dst, const E &exp); +}; +/*! \brief the engine that dispatches simple operations*/ +template +struct ExpEngine { + template + inline static void Eval(RV *dst, + const Exp &exp) { + MapExp(dst, exp); + } + template + inline static void Eval(RV *dst, + const Exp &exp) { + MapExp(dst, exp); + } + template + inline static void Eval(RV *dst, + const Exp &exp) { + MapExp(dst, exp); + } + template + inline static void Eval(RV *dst, + const Exp &exp) { + ExpComplexEngine::Eval(dst->ptrself(), exp.self()); + } +}; +template +struct ExpComplexEngine, + DotExp, + Tensor, + ltrans, rtrans, DType>, + DType> { + inline static void Eval(Tensor *dst, + const DotExp, + Tensor, + ltrans, rtrans, DType> &exp) { + DotEngine::Eval(dst, exp.lhs_, exp.rhs_, exp.scale_); + } +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXPR_ENGINE_INL_H_ diff --git a/mshadow/expr_scalar-inl.h b/mshadow/expr_scalar-inl.h new file mode 100644 index 000000000000..a0efdc1ab649 --- /dev/null +++ b/mshadow/expr_scalar-inl.h @@ -0,0 +1,123 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file expression-inl.h + * \brief definitions of operators in expression with respect to scalar + * this file will be included several times, each time with MACRO MSHADOW_SCALAR_ to be different types + * + * DO NOT add pragma once or macro guard + * \author Tianqi Chen, Bing Xu + */ +namespace mshadow { +namespace expr { +// DotExp +/*! \brief dot operator def */ +template +inline DotExp +operator*(const DotExp &lhs, + MSHADOW_SCALAR_ rhs) { + return DotExp(lhs.lhs_, lhs.rhs_, lhs.scale_ * rhs); +} +/*! 
\brief scale of dot operation */ +template +inline DotExp +operator*(MSHADOW_SCALAR_ lhs, + const DotExp &rhs) { + return DotExp(rhs.lhs_, rhs.rhs_, rhs.scale_ * lhs); +} + +/*! \brief operator overload */ +template +inline ReduceTo1DExp +operator*(const ReduceTo1DExp &e, MSHADOW_SCALAR_ scale) { + return ReduceTo1DExp(e.src_, e.scale_ * scale); +} +/*! \brief operator overload */ +template +inline ReduceTo1DExp +operator*(MSHADOW_SCALAR_ scale, const ReduceTo1DExp &e) { + return ReduceTo1DExp(e.src_, e.scale_ * scale); +} + +/*! \brief operator overload for const */ +template +inline BinaryMapExp, + MSHADOW_SCALAR_, (ta|type::kMapper)> +F(const Exp &lhs, const ScalarExp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload for const */ +template +inline BinaryMapExp, TB, + MSHADOW_SCALAR_, (tb|type::kMapper)> +F(const ScalarExp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +// constant operators +/*! \brief operator overload */ +template +inline BinaryMapExp, + MSHADOW_SCALAR_, (ta|type::kMapper)> +operator+(const Exp &lhs, + const ScalarExp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, + MSHADOW_SCALAR_, (ta|type::kMapper)> +operator-(const Exp &lhs, + const ScalarExp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, + MSHADOW_SCALAR_, (ta|type::kMapper)> +operator*(const Exp &lhs, + const ScalarExp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, + MSHADOW_SCALAR_, (ta|type::kMapper)> +operator/(const Exp &lhs, + const ScalarExp &rhs) { + return MakeExp(lhs, rhs); +} +// constant operators 2 +/*! \brief operator overload */ +template +inline BinaryMapExp, TB, + MSHADOW_SCALAR_, (tb|type::kMapper)> +operator+(const ScalarExp &lhs, + const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, TB, + MSHADOW_SCALAR_, (tb|type::kMapper)> +operator-(const ScalarExp &lhs, + const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, TB, + MSHADOW_SCALAR_, (tb|type::kMapper)> +operator*(const ScalarExp &lhs, + const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp, TB, + MSHADOW_SCALAR_, (tb|type::kMapper)> +operator/(const ScalarExp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +} // namespace expr +} // namespace mshadow diff --git a/mshadow/expression.h b/mshadow/expression.h new file mode 100644 index 000000000000..d73c11f7d40c --- /dev/null +++ b/mshadow/expression.h @@ -0,0 +1,355 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file expression.h + * \brief definitions of abstract expressions and expressions template + * \author Tianqi Chen, Bing Xu + */ +#ifndef MSHADOW_EXPRESSION_H_ +#define MSHADOW_EXPRESSION_H_ +#include "./base.h" + +namespace mshadow { +/*! + * \brief namespace for abstract expressions and expressions template, + * have no dependecy on tensor.h, + * These data structure takes no charge in computations, + * they are only used to define operations and represent expression in a symbolic way + */ +namespace expr { +/*! \brief type of expressions */ +namespace type { +// type expression type are defined as bitmask +// subtype relationshop kRValue < kMapper < kPull < kComplex +/*! + * \brief this expression directly correspnds to a data class, + * can be used to assign data + */ +const int kRValue = 0; +/*! 
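The scalar-overload file above is deliberately written without an include guard: it is included once per scalar type with `MSHADOW_SCALAR_` redefined each time, and the operators it stamps out are what make mixed scalar/expression arithmetic legal. A sketch of the kind of code they enable, with assumed shapes dst(n, m), lhs(n, k), rhs(m, k):

```
#include "mshadow/tensor.h"

// assumed shapes: dst is (n, m), lhs is (n, k), rhs is (m, k)
void ScaledDot(mshadow::Tensor<mshadow::cpu, 2, float> dst,
               mshadow::Tensor<mshadow::cpu, 2, float> lhs,
               mshadow::Tensor<mshadow::cpu, 2, float> rhs) {
  using namespace mshadow::expr;
  // DotExp * scalar: the factor is folded into the BLAS alpha argument
  dst = dot(lhs, rhs.T()) * 0.5f;
  // expression op scalar: becomes an element-wise Mapper expression
  dst = dst * 2.0f + 1.0f;
}
```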
+ * \brief expression contains element-wise tensor operations, + * map a expression to same shape + */ +const int kMapper = 1; +/*! + * \brief expression that can be chained with other expressiones + * Usually it have function Eval(i,j) defined, which pulls the result (i, j) from input + * expression and output the result at certain position. + */ +const int kChainer = 3; +/*! \brief othercase: e.g dot product */ +const int kComplex = 7; +} // namespace type +/*! + * \brief expression engine that actually interprets these expressions + * this is a function template that needed to be implemented for specific expressions + * \tparam Saver the save method + * \tparam RValue the type of RValue to be saved + * \sa namespace sv + */ +template +struct ExpEngine; +/*! \brief defines how expression exp can be evaluated and stored into dst */ +//template +//inline static void Eval(RValue *dst, const EType &exp); +/*! + * \brief base class for expression + * \tparam SubType inheritated class must put their type into this parameter + * \tparam DType the data type of each element in the expression + * \tparam exp_type expression type, see namespace type + */ +template +struct Exp { + public: + /*! \return subtype instance of current class */ + inline const SubType& self(void) const { + return *static_cast(this); + } + /*! \return reference of subtype instance of current class */ + inline SubType* ptrself(void) { + return static_cast(this); + } +}; +/*! + * \brief scalar expression + * \tparam DType the data type of the scalar + */ +template +struct ScalarExp: public Exp, DType, type::kMapper> { + /*! \brief scalar value */ + DType scalar_; + /*! \brief implicit constructor, MUST NOT BE explicit */ + ScalarExp(DType scalar) : scalar_(scalar) {} +}; +/*! \brief create an scalar expression */ +template +inline ScalarExp scalar(DType s) { + return ScalarExp(s); +} +/*! + * \brief typecast expression, cast the type of elements + * \tparam DstDType the target type we want to cast into + * \tparam SrcDType the target type we want to cast from + * \tparam EType the type of the source expression + * \tparam etype the type of expression after cast + */ +template +struct TypecastExp: + public Exp, + DstDType, etype> { + /*! \brief expression to be typecasted */ + const EType &exp; + /*! \brief constructor */ + explicit TypecastExp(const EType &e) : exp(e) {} +}; +/*! \brief create an scalar expression */ +template +inline TypecastExp +tcast(const Exp &exp) { + return TypecastExp(exp.self()); +} +/*! \brief represent a transpose expression of a container */ +template +struct TransposeExp: public Exp, + DType, type::kChainer> { + /*! \brief expression to be transposed */ + const EType &exp; + /*! \brief constructor */ + explicit TransposeExp(const EType &e) : exp(e) {} + /*! \brief transpose expression */ + inline const EType &T(void) const { + return exp; + } +}; +/*! + * \brief base class of all rvalues + * \tparam Container the actually class of data container, e.g. Tensor1D + * \tparam DataType the element data type of each element in the container + */ +template +class RValueExp: public Exp { + public: + /*! + *\brief transpose of a matrix + *\return transpose of current expression + */ + inline const TransposeExp T(void) const { + return TransposeExp(this->self()); + } + /*! \brief operator overload */ + inline Container &operator+=(DType s) { + ExpEngine::Eval(this->ptrself(), scalar(s)); + return *(this->ptrself()); + } + /*! 
\brief operator overload */ + inline Container &operator-=(DType s) { + ExpEngine::Eval(this->ptrself(), scalar(s)); + return *(this->ptrself()); + } + /*! \brief operator overload */ + inline Container &operator*=(DType s) { + ExpEngine::Eval(this->ptrself(), scalar(s)); + return *(this->ptrself()); + } + /*! \brief operator overload */ + inline Container &operator/=(DType s) { + ExpEngine::Eval(this->ptrself(), scalar(s)); + return *(this->ptrself()); + } + /*! \brief operator overload */ + inline Container &__assign(DType s) { + ExpEngine::Eval(this->ptrself(), scalar(s)); + return *(this->ptrself()); + } + /*! \brief we can not define container = container */ + template + inline Container &__assign(const Exp &exp) { + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); + } + /*! \brief operator overload, assign */ + inline Container &__assign(const Exp &exp); + /*! \brief implementation of operator+= */ + template + inline Container &operator+=(const Exp &exp) { + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); + } + /*! \brief implementation of operator-= */ + template + inline Container &operator-=(const Exp &exp) { + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); + } + /*! \brief implementation of operator*= */ + template + inline Container &operator*=(const Exp &exp) { + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); + } + /*! \brief implementation of operator/= */ + template + inline Container &operator/=(const Exp &exp) { + ExpEngine::Eval(this->ptrself(), exp.self()); + return *(this->ptrself()); + } +}; +/*! + * \brief matrix multiplication expression dot(lhs[.T], rhs[.T]) + * \tparam TA type of lhs + * \tparam TB type of rhs + * \tparam ltrans whether lhs is transposed + * \tparam rtrans whether rhs is transposed + * \tparam DType the data type of the scalar + */ +template +struct DotExp: public Exp, + DType, type::kComplex> { + /*! \brief left operand */ + const TA &lhs_; + /*! \brief right operand */ + const TB &rhs_; + /*! \brief scale over result */ + DType scale_; + /*! \brief constructor */ + explicit DotExp(const TA &lhs, const TB &rhs, DType scale) + : lhs_(lhs), rhs_(rhs), scale_(scale) {} +}; +// definition of dot expression +/*! \brief dot operator def */ +template +inline DotExp +dot(const RValueExp &lhs, const RValueExp &rhs) { + return DotExp(lhs.self(), rhs.self(), 1.0f); +} +/*! \brief dot operator def */ +template +inline DotExp +dot(const TransposeExp &lhs, const RValueExp &rhs) { + return DotExp(lhs.exp, rhs.self(), 1.0f); +} +/*! \brief dot operator def */ +template +inline DotExp +dot(const RValueExp &lhs, const TransposeExp &rhs) { + return DotExp(lhs.self(), rhs.exp, 1.0f); +} +/*! \brief dot operator def */ +template +inline DotExp +dot(const TransposeExp &lhs, const TransposeExp &rhs) { + return DotExp(lhs.exp, rhs.exp, 1.0f); +} +//--------------- +// BinaryMapExp +// -------------- +/*! + * \brief binary map expression lhs [op] rhs + * \tparam OP operator + * \tparam TA type of lhs + * \tparam TB type of rhs + * \tparam etype expression type, sa namespace::type + */ +template +struct BinaryMapExp: public Exp, + DType, etype> { + /*! \brief left operand */ + const TA &lhs_; + /*! \brief right operand */ + const TB &rhs_; + /*! \brief constructor */ + explicit BinaryMapExp(const TA &lhs, const TB &rhs) + :lhs_(lhs), rhs_(rhs) {} +}; + +/*! 
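A short sketch of how the `DotExp` builders above might be used, assuming the library is compiled with a CPU BLAS backend so that `dot()` can be evaluated. The layer convention below (forward pass `y = dot(x, W)`) and the variable names are illustrative only.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

// Backward pass of a fully connected layer, assuming forward y = dot(x, W)
// with x: (n, in), W: (in, out), y and dy: (n, out).
void fullc_backward(Tensor<cpu, 2, float> dx,
                    Tensor<cpu, 2, float> dW,
                    Tensor<cpu, 2, float> x,
                    Tensor<cpu, 2, float> dy,
                    Tensor<cpu, 2, float> W) {
  // dx = dy * W^T via the transpose form of dot
  dx = dot(dy, W.T());
  // a DotExp can be scaled by a constant and accumulated with +=
  dW += 0.5f * dot(x.T(), dy);
}
```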
\brief make expression */ +template +inline BinaryMapExp +MakeExp(const Exp &lhs, const Exp &rhs) { + return BinaryMapExp(lhs.self(), rhs.self()); +} +/*! + * \brief short hand for MakeExp, usage F(lhs, rhs). create a binary operation expression + * \param lhs left operand + * \param rhs right operand + * \return the result expression + * \tparam binary operator + * \tparam TA lhs expression + * \tparam ta lhs expression type + * \tparam TB rhs expression + * \tparam tb rhs expression type + * \sa mshadow::op + */ +template +inline BinaryMapExp +F(const Exp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +// operator rules +/*! \brief operator overload */ +template +inline BinaryMapExp +operator+(const Exp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp +operator-(const Exp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp +operator*(const Exp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +/*! \brief operator overload */ +template +inline BinaryMapExp +operator/(const Exp &lhs, const Exp &rhs) { + return MakeExp(lhs, rhs); +} +//--------------- +// UnaryMapExp +// -------------- +/*! + * \brief unary map expression op(src) + * \tparam OP operator + * \tparam TA type of src + * \tparam etype expression type, sa namespace::type + */ +template +struct UnaryMapExp: public Exp, + DType, etype> { + /*! \brief source expression */ + const TA &src_; + /*! \brief constructor */ + explicit UnaryMapExp(const TA &src) : src_(src) {} +}; + +/*! \brief make expression */ +template +inline UnaryMapExp +MakeExp(const Exp &src) { + return UnaryMapExp(src.self()); +} +/*! + * \brief short hand for MakeExp, usage F(src), create a unary operation expression + * \param src source expression + * \return the result expression + * \tparam operator + * \tparam TA source expression + * \tparam ta source expression type + * \sa mshadow::op + */ +template +inline UnaryMapExp +F(const Exp &src) { + return MakeExp(src); +} +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXPRESSION_H_ diff --git a/mshadow/extension.h b/mshadow/extension.h new file mode 100644 index 000000000000..882d367a72dd --- /dev/null +++ b/mshadow/extension.h @@ -0,0 +1,26 @@ +/*! + * + * \file extension.h + * \brief some extension of expressions, + * used to support something beyond elementwise op + * \author Tianqi Chen, Bing Xu + */ +#ifndef MSHADOW_EXTENSION_H_ +#define MSHADOW_EXTENSION_H_ +#include "./expr_engine-inl.h" +#include "./extension/broadcast.h" +#include "./extension/unpack_patch2col.h" +#include "./extension/pack_col2patch.h" +#include "./extension/reshape.h" +#include "./extension/swapaxis.h" +#include "./extension/reduceto1d.h" +#include "./extension/spatial_pool.h" +#include "./extension/spatial_unpool.h" +#include "./extension/channel_pool.h" +#include "./extension/channel_unpool.h" +#include "./extension/pad.h" +#include "./extension/crop.h" +#include "./extension/mirror.h" +#include "./extension/concat.h" +#endif + diff --git a/mshadow/extension/broadcast.h b/mshadow/extension/broadcast.h new file mode 100644 index 000000000000..9a8b57bffc7d --- /dev/null +++ b/mshadow/extension/broadcast.h @@ -0,0 +1,107 @@ +/*! 
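Sketch of plugging user-defined operators into `F<op>` through the `UnaryMapExp` and `BinaryMapExp` machinery above. The `clipval` and `takemax` structs are hypothetical examples written for this note; an operator only needs a static `Map` function.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

// hypothetical unary op: clip each element to [-1, 1]
struct clipval {
  MSHADOW_XINLINE static float Map(float a) {
    return a > 1.0f ? 1.0f : (a < -1.0f ? -1.0f : a);
  }
};
// hypothetical binary op: element-wise maximum of two inputs
struct takemax {
  MSHADOW_XINLINE static float Map(float a, float b) {
    return a > b ? a : b;
  }
};

void apply_ops(Tensor<cpu, 2, float> dst,
               Tensor<cpu, 2, float> lhs,
               Tensor<cpu, 2, float> rhs) {
  dst = F<clipval>(lhs);              // UnaryMapExp
  dst = F<takemax>(lhs, rhs);         // BinaryMapExp
  dst = F<takemax>(lhs, rhs) * 2.0f;  // still a Mapper expression, composes freely
}
```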
+ * Copyright (c) 2014 by Contributors + * \file broadcast.h + * \brief support for broadcast and repmat + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_BROADCAST_H_ +#define MSHADOW_EXTENSION_BROADCAST_H_ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief broadcast Tensor1D into a higher dimension Tensor + * input: Tensor: ishape[0] + * output: Tensor : oshape[dimcast] = ishape[0] + * \tparam SrcExp type of input expression + * \tparam DType the type of elements + * \tparam dimdst target tensor dimension + * \tparam dimcast_m_dst dimcast - dimdst + */ +template +struct Broadcast1DExp: + public MakeTensorExp, + SrcExp, dimdst, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief constructor */ + Broadcast1DExp(const SrcExp &src, Shape shape) + : src_(src) { + this->shape_ = shape; + } +}; +/*! + * \brief a expression that replicate a 1 dimension tensor in dimension dimcast + * \param src Tensor: shape[0] + * \param shape shape of output + * \return a expresion with type Tensor + * \tparam dimcast target dimension where the 1D tensor will be broadcasted + * \tparam SrcExp type of input expression + * \tparam DType the type of elements + * \tparam dimdst dimension of destination tensor + * \tparam dimcast_lowest the dimension we want to cast the data into + */ +template +inline Broadcast1DExp +broadcast(const expr::Exp &src, Shape shape) { + TypeCheckPass::kDim == 1> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + utils::Check(ShapeCheck<1, SrcExp>::Check(src.self())[0] == shape[dimcast], + "broadcast, shape mismatch"); + return Broadcast1DExp(src.self(), shape); +} +// short cut functions +/*! + * \brief a expression that replicate a 1 dimension tensor for nrow times + * \param src Tensor: shape[0] + * \param nrow number of rows to replicate + * \return a expresion with type Tensor size(1), size(0) = nrow + * \tparam Device which device it lies + */ +template +inline Broadcast1DExp +repmat(const expr::Exp &src, index_t nrow) { + return broadcast<1> + (src, Shape2(nrow, ShapeCheck<1, SrcExp>::Check(src.self())[0])); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + static const int dimcast = dimdst - dimdst_m_cast; + explicit Plan(const Broadcast1DExp &e) + : src_(MakePlan(e.src_)), + ystride_(e.shape_.ProdShape(dimcast + 1, dimdst - 1)), + length_(e.shape_[dimcast]) { + TypeCheckPass + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + } + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return src_.Eval(0, (y / ystride_) % length_); + } + + private: + expr::Plan src_; + const index_t ystride_, length_; +}; + +/*! \brief execution plan of Broadcast1DExp */ +template +struct Plan, DType>{ + public: + explicit Plan(const Broadcast1DExp &e) + : src_(MakePlan(e.src_)) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return src_.Eval(0, x); + } + + private: + expr::Plan src_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_BROADCAST_H_ diff --git a/mshadow/extension/channel_pool.h b/mshadow/extension/channel_pool.h new file mode 100644 index 000000000000..4039d1d6303a --- /dev/null +++ b/mshadow/extension/channel_pool.h @@ -0,0 +1,108 @@ +/*! 
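Sketch of the two entry points defined in this header, `repmat` and `broadcast<dimcast>`. The layouts assumed below (2D activations as batch by hidden, 4D activations as batch, channel, height, width) are illustrative.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void add_bias(Tensor<cpu, 2, float> out, Tensor<cpu, 1, float> bias) {
  // out: (nbatch, nhidden), bias: (nhidden); replicate bias for every row
  out += repmat(bias, out.size(0));
}

void add_channel_bias(Tensor<cpu, 4, float> out, Tensor<cpu, 1, float> bias) {
  // out: (batch, channel, height, width); bias has one entry per channel,
  // so broadcast it into dimension 1
  out += broadcast<1>(bias, out.shape_);
}
```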
+ * Copyright (c) 2014 by Contributors + * \file channel_pool.h + * \brief support for chpool + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_CHANNEL_POOL_H_ +#define MSHADOW_EXTENSION_CHANNEL_POOL_H_ +#include +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief channel pooling expression, do reduction over (local nearby) channels, + * used to implement local response normalization + * \tparam Reducer reduction method during pooling + * \tparam SrcExp source expression to be pooled from + * \tparam DType the type of elements + * \tparam srcdim dimension of src + */ +template +struct ChannelPoolingExp: + public MakeTensorExp, + SrcExp, srcdim, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief neighbor size */ + index_t nsize_; + /*! \brief stride of pooling */ + index_t stride_; + /*! \brief pad of pooling of each side */ + index_t pad_; + index_t src_channel_; + /*! \brief constructor */ + ChannelPoolingExp(const SrcExp &src, index_t nsize, index_t stride, index_t pad) + : src_(src), nsize_(nsize), stride_(stride), pad_(pad) { + this->shape_ = ShapeCheck::Check(src_); + this->src_channel_ = this->shape_[srcdim - 3]; + utils::Check(this->shape_[srcdim - 3] >= nsize_, + "chpool: local size must be smaller than nchannels"); + this->shape_[srcdim - 3] = (this->src_channel_ - nsize + pad * 2 + 1) / stride; + } +}; +/*! + * \brief channel pooling, do reduction over (local nearby) channels, + * used to implement local response normalization + * \param src source data + * \param nsize neighbor size + * \return expression of pooled result + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline ChannelPoolingExp::kDim> +chpool(const Exp &src, index_t nsize) { + TypeCheckPass::kDim >= 3> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + utils::Check(nsize % 2 == 1, + "chpool: if no pad is specified, local size must be odd"); + return ChannelPoolingExp::kDim>(src.self(), nsize, 1, nsize / 2); +} + +template +inline ChannelPoolingExp::kDim> +chpool(const Exp &src, index_t nsize, index_t stride, index_t pad) { + TypeCheckPass::kDim >= 3> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return ChannelPoolingExp::kDim>(src.self(), nsize, stride, pad); +} + +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const ChannelPoolingExp &e) + : src_(MakePlan(e.src_)), channel_(e.shape_[srcdim - 3]), + height_(e.shape_[srcdim - 2]), width_(e.shape_[srcdim - 1]), + hnsize_(e.nsize_), stride_(e.stride_), pad_(e.pad_), + src_channel_(e.src_channel_){} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + using namespace std; + const index_t y = i % height_; + i /= height_; + const index_t c = i % channel_; + const index_t n = i / channel_; + const index_t x = j; + const index_t cstart = c * stride_ < pad_ ? 
0 : c * stride_ - pad_; + const index_t cend = min(cstart + hnsize_, channel_); + DType res; Reducer::SetInitValue(res); + for (index_t cc = cstart; cc < cend; ++cc) { + Reducer::Reduce(res, src_.Eval((n * src_channel_ + cc) * height_ + y, x)); + } + return res; + } + private: + Plan src_; + const index_t channel_, height_, width_, hnsize_, stride_, pad_, src_channel_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_CHANNEL_POOL_H_ + diff --git a/mshadow/extension/channel_unpool.h b/mshadow/extension/channel_unpool.h new file mode 100644 index 000000000000..6257391d2fd0 --- /dev/null +++ b/mshadow/extension/channel_unpool.h @@ -0,0 +1,132 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file channel_pool.h + * \brief support for chpool + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_CHANNEL_UNPOOL_H_ +#define MSHADOW_EXTENSION_CHANNEL_UNPOOL_H_ +#include +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief channel pooling expression, do reduction over (local nearby) channels, + * used to implement local response normalization + * \tparam Reducer reduction method during pooling + * \tparam SrcExp source expression to be pooled from + * \tparam DType the type of elements + * \tparam srcdim dimension of src + */ +template +struct ChannelUnpoolingExp: + public MakeTensorExp, + SrcExp, srcdim, DType> { + /*! \brief source input, corresponds to src in pooling */ + const SrcExp &data_src_; + /*! \brief result of pooled data, corresponds to result of pooling */ + const SrcExp &data_pooled_; + /*! \brief gradient data of pooled part, to be propgate down */ + const SrcExp &grad_pooled_; + /*! \brief channel of pooled expression */ + index_t pchannel_; + /*! \brief kernel size in height */ + index_t nsize_; + /*! \brief kernel size in width */ + index_t kstride_; + /*! \brief pad */ + index_t pad_; + /*! \brief constructor */ + ChannelUnpoolingExp(const SrcExp &data_src, + const SrcExp &data_pooled, + const SrcExp &grad_pooled, + index_t nsize, index_t kstride, index_t pad) + : data_src_(data_src), data_pooled_(data_pooled), + grad_pooled_(grad_pooled), + nsize_(nsize), kstride_(kstride), pad_(pad) { + Shape pshape = ShapeCheck::Check(grad_pooled); + utils::Check(pshape == ShapeCheck::Check(data_pooled), + "ChannelUnPoolingExp: data and grad shape mismatch"); + Shape sshape = ShapeCheck::Check(data_src); + for (int k = 0; k < srcdim; ++k) { + if (k == 1){ + continue; + } + utils::Check(pshape[k] == sshape[k], + "ChannelUnPoolingExp: pooled tensor and src tensor shape mismatch"); + } + pchannel_ = pshape[1]; + this->shape_ = sshape; + } +}; +/*! 
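A sketch of the local response normalization pattern that the channel pooling comment refers to. It assumes the built-in `red::sum` reducer; the `square` and `invpow` operators (and the fixed exponent in `invpow`) are defined locally for illustration and are not part of mshadow.

```
#include <cmath>
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

struct square {
  MSHADOW_XINLINE static float Map(float a) { return a * a; }
};
struct invpow {  // a * b^(-beta) with a fixed beta, illustrative only
  MSHADOW_XINLINE static float Map(float a, float b) {
    return a * powf(b, -0.75f);
  }
};

void lrn_forward(Tensor<cpu, 4, float> out,
                 Tensor<cpu, 4, float> in,
                 Tensor<cpu, 4, float> tmp_norm,
                 index_t nsize) {
  // sum of squares over a window of nsize nearby channels (nsize must be odd
  // for the two-argument chpool, which pads by nsize / 2)
  tmp_norm = chpool<red::sum>(F<square>(in), nsize) + 1.0f;
  out = F<invpow>(in, tmp_norm);
}
```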
+ * \brief channel unpooling, do unroll over (local nearby) channels + * \param src source data + * \param nsize neighbor size + * \param stride stride of the pooling + * \param pad number of padding at each side + * \return expression of pooled result + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline ChannelUnpoolingExp::kDim> +ch_unpool(const Exp &data_src, + const Exp &data_pooled, + const Exp &grad_pooled, + index_t nsize, index_t stride, index_t pad) { + TypeCheckPass::kDim >= 3> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return ChannelUnpoolingExp::kDim> + (data_src.self(), data_pooled.self(), grad_pooled.self(), nsize, stride, pad); +} + +template +inline ChannelUnpoolingExp::kDim> +ch_unpool(const Exp &data_src, + const Exp &data_pooled, + const Exp &grad_pooled, index_t nsize) { + return ch_unpool(data_src, data_pooled, grad_pooled, nsize, 1, nsize / 2); +} + + +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const ChannelUnpoolingExp &e) + : data_src_(e.data_src_), data_pooled_(e.data_pooled_), + grad_pooled_(e.grad_pooled_), channel_(e.shape_[srcdim - 3]), + height_(e.shape_[srcdim - 2]), pchannel_(e.pchannel_), + hnsize_(e.nsize_), stride_(e.kstride_), pad_(e.pad_){} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + using namespace std; + const DType vsrc = data_src_.Eval(i, j); + const index_t y = i % height_; + i /= height_; + const index_t c = i % channel_; + const index_t n = i / channel_; + const index_t x = j; + const index_t cstart = c < hnsize_ - pad_ ? 0 + : (c - (hnsize_ - pad_) + stride_) / stride_; + const index_t cend = min((c + pad_ + stride_) / stride_, channel_); + DType val = static_cast(0); + for (index_t cc = cstart; cc < cend; ++cc) { + val += Reducer::PartialGrad(vsrc, + data_pooled_.Eval((n * pchannel_ + cc) * height_ + y, x)) * + grad_pooled_.Eval((n * pchannel_ + cc) * height_ + y, x); + } + return val; + } + private: + Plan data_src_, data_pooled_, grad_pooled_; + const index_t channel_, height_, pchannel_, hnsize_, stride_, pad_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_CHANNEL_POOL_H_ + diff --git a/mshadow/extension/concat.h b/mshadow/extension/concat.h new file mode 100644 index 000000000000..e7ae27735a0f --- /dev/null +++ b/mshadow/extension/concat.h @@ -0,0 +1,177 @@ +#ifndef MSHADOW_EXTENSION_CONCAT_H_ +#define MSHADOW_EXTENSION_CONCAT_H_ + +#include "../extension.h" + +namespace mshadow { +namespace expr { +/*! 
+ * \brief concat expression, concat two tensor's channel + * \tparam LhsExp left expression + * \tparam RhsExp right expression + * \tparam DType the type of elements + * \tparam srcdim dimension of src + * \tparam dimsrc_m_cat dimsrc - dimcat + */ +template +struct ConcatExp : public TRValue, + Device, srcdim, DType> { + static const int dimcat = srcdim - dimsrc_m_cat; + const LhsExp &src1_; + const RhsExp &src2_; + index_t dcat_src1_; + index_t dcat_src2_; + Shape<4> shape_; + ConcatExp(const LhsExp &src1, const RhsExp &src2) : src1_(src1), src2_(src2) { + Shape sshape1 = ShapeCheck::Check(src1_); + Shape sshape2 = ShapeCheck::Check(src2_); + #pragma unroll + for (int i = 0; i < srcdim; ++i) { + if (i != dimcat) { + utils::Check(sshape1[i] == sshape2[i], + "ConcatExp: shape mismatch"); + } + } + this->shape_ = sshape1; + this->shape_[dimcat] = sshape1[dimcat] + sshape2[dimcat]; + this->dcat_src1_ = sshape1[dimcat]; + this->dcat_src2_ = sshape2[dimcat]; + } + template + inline void + operator=(const expr::Exp &exp) { + this->__assign(exp); + } + inline void + operator=(const DType &exp) { + this->__assign(exp); + } +}; // struct ConcatExp +/*! + * \brief concat two 4D tensor + * \param src1 source tensor1 + * \param src2 source tensor2 + * \return concated 4D tensor + * \tparam cdim the dimension to concatnate on + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline ConcatExp +concat(const TRValue &src1, + const TRValue &src2) { + TypeCheckPass::kDim == ExpInfo::kDim> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + TypeCheckPass::kDim == srcdim> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return ConcatExp + (src1.self(), src2.self()); +} +//------------------------ +// engine plugin +//------------------------ +// runtime shapecheck +template +struct ShapeCheck >{ + inline static Shape Check(const ConcatExp &t) { + return t.shape_; + } +}; +template +struct StreamInfo >{ + inline static Stream * + Get(const ConcatExp &t) { + Stream *lhs = StreamInfo::Get(t.src1_); + Stream *rhs = StreamInfo::Get(t.src2_); + if (lhs != rhs) return NULL; + return lhs; + } +}; +// static typecheck +template +struct ExpInfo >{ + static const int kDimLhs = ExpInfo::kDim; + static const int kDimRhs = ExpInfo::kDim; + // copy from binarymap + static const int kDim = (kDimLhs >= 0 && kDimRhs >= 0) ?\ + (kDimLhs == 0 ?\ + kDimRhs :\ + ((kDimRhs == 0 || kDimLhs == kDimRhs) ? 
kDimLhs : -1)) : -1; + static const int kDevMask = ExpInfo::kDevMask & ExpInfo::kDevMask; +}; +//---------------------- +// Execution plan +//--------------------- +template +struct Plan, DType> { + public: + static const int dimcat = srcdim - dimsrc_m_cat; + explicit Plan(const ConcatExp &e) + : src1_(MakePlan(e.src1_)), src2_(MakePlan(e.src2_)), + height_(e.shape_.ProdShape(dimcat + 1, srcdim - 1)), + ch_src1_(e.dcat_src1_), ch_src2_(e.dcat_src2_), ch_(e.shape_[dimcat]) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + const index_t y = i % height_; + i /= height_; + const index_t c = i % ch_; + const index_t b = i / ch_; + const index_t x = j; + if (c < ch_src1_) return src1_.Eval((b * ch_src1_ + c) * height_ + y, x); + else return src2_.Eval((b * ch_src2_ + c - ch_src1_) * height_ + y, x); + } + MSHADOW_XINLINE DType &REval(index_t i, index_t j) { + const index_t y = i % height_; + i /= height_; + const index_t c = i % ch_; + const index_t b = i / ch_; + const index_t x = j; + if (c < ch_src1_) return src1_.REval((b * ch_src1_ + c) * height_ + y, x); + else return src2_.REval((b * ch_src2_ + c - ch_src1_) * height_ + y, x); + } + + private: + Plan src1_; + Plan src2_; + const index_t height_, ch_src1_, ch_src2_, ch_; +}; // struct Plan + +// specialize for concat in x +template +struct Plan, DType> { + public: + explicit Plan(const ConcatExp &e) + : src1_(MakePlan(e.src1_)), src2_(MakePlan(e.src2_)), + width_src1_(e.dcat_src1_) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + if (x < width_src1_) return src1_.Eval(y, x); + else return src2_.Eval(y, x - width_src1_); + } + MSHADOW_XINLINE DType &REval(index_t y, index_t x) { + if (x < width_src1_) return src1_.REval(y, x); + else return src2_.REval(y, x - width_src1_); + } + + private: + Plan src1_; + Plan src2_; + const index_t width_src1_; +}; +}// namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_CONCAT_H_ diff --git a/mshadow/extension/crop.h b/mshadow/extension/crop.h new file mode 100644 index 000000000000..d740d7bb18c9 --- /dev/null +++ b/mshadow/extension/crop.h @@ -0,0 +1,121 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file crop.h + * \brief support for crop + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_CROP_H_ +#define MSHADOW_EXTENSION_CROP_H_ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief crop expression, cut off the boundary region, reverse operation of padding + * \tparam SrcExp source expression to be pooled from + * \tparam DType the type of elements + * \tparam srcdim dimension of src + */ +template +struct CroppingExp: + public MakeTensorExp, + SrcExp, srcdim, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief pad height */ + index_t pad_height_; + /*! \brief pad height */ + index_t pad_width_; + /*! \brief src height */ + index_t src_height_; + /*! \brief constructor */ + explicit CroppingExp(const SrcExp &src, Shape<2> cshape) + : src_(src) { + this->shape_ = ShapeCheck::Check(src_); + utils::Check(this->shape_[srcdim - 2] >= cshape[0], + "CroppingExp: height requirement not met"); + utils::Check(this->shape_[srcdim - 1] >= cshape[1], + "CroppingExp: width requirement not met"); + pad_height_ = (this->shape_[srcdim - 2] - cshape[0]) / 2; + pad_width_ = (this->shape_[srcdim - 1] - cshape[1]) / 2; + src_height_ = this->shape_[srcdim - 2]; + this->shape_[srcdim - 2] = cshape[0]; // height + this->shape_[srcdim - 1] = cshape[1]; // width + } + /*! 
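Sketch of the `concat<cdim>` expression defined above, assuming the batch, channel, height, width layout used elsewhere in these headers so that the channel axis is dimension 1.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void merge_features(Tensor<cpu, 4, float> out,
                    Tensor<cpu, 4, float> feat_a,
                    Tensor<cpu, 4, float> feat_b) {
  // all non-channel dimensions of feat_a and feat_b must match;
  // out must have feat_a channels + feat_b channels
  out = concat<1>(feat_a, feat_b);
}
```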
\brief constructor */ + explicit CroppingExp(const SrcExp &src, Shape<2> cshape, + index_t start_height, index_t start_width) + : src_(src), pad_height_(start_height), pad_width_(start_width) { + this->shape_ = ShapeCheck::Check(src_); + utils::Check(this->shape_[srcdim - 2] >= cshape[0] + start_height, + "CroppingExp: height requirement not met"); + utils::Check(this->shape_[srcdim - 1] >= cshape[1] + start_width, + "CroppingExp: width requirement not met"); + src_height_ = this->shape_[srcdim - 2]; + this->shape_[srcdim - 2] = cshape[0]; // height + this->shape_[srcdim - 1] = cshape[1]; // width + } +}; // struct CroppingExp +/*! + * \brief revserse operationg of padding, cut off boundaries, + * crop output from center of input + * \param src original image batches + * \param oshape output shape to be cropped + * \return expression corresponding to padded result + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline CroppingExp::kDim> +crop(const Exp &src, Shape<2> oshape) { + TypeCheckPass::kDim >= 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return CroppingExp::kDim>(src.self(), oshape); +} +/*! + * \brief same as crop, but can specify starting position to do cropping + * \param src original image batches + * \param oshape output shape to be cropped + * \param start_height start height position to do cropping + * \param start_width start width position to do cropping + * \return expression corresponding to padded result + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline CroppingExp::kDim> +crop(const Exp &src, Shape<2> oshape, + index_t start_height, index_t start_width) { + TypeCheckPass::kDim >= 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return CroppingExp::kDim> + (src.self(), oshape, start_height, start_width); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const CroppingExp &e) + : src_(MakePlan(e.src_)), + pad_height_(e.pad_height_), pad_width_(e.pad_width_), + new_height_(e.shape_[srcdim - 2]), src_height_(e.src_height_) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + const index_t x = j; + const index_t y = i % new_height_; + const index_t c = i / new_height_; + const index_t h = y + pad_height_; + const index_t w = x + pad_width_; + return src_.Eval(c * src_height_ + h, w); + } + private: + Plan src_; + const index_t pad_height_, pad_width_; + const index_t new_height_; + const index_t src_height_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_CROP_H_ diff --git a/mshadow/extension/mirror.h b/mshadow/extension/mirror.h new file mode 100644 index 000000000000..9e9edc9b6f70 --- /dev/null +++ b/mshadow/extension/mirror.h @@ -0,0 +1,62 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file mirror.h + * \brief support for mirror + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_MIRROR_H_ +#define MSHADOW_EXTENSION_MIRROR_H_ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief mirror expression, mirror a image in width + * \tparam SrcExp source expression to be mirrored + * \tparam DType the type of elements + * \tparam srcdim dimension of src + */ +template +struct MirroringExp: + public MakeTensorExp, + SrcExp, srcdim, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! 
\brief constructor */ + explicit MirroringExp(const SrcExp &src) : src_(src) { + this->shape_ = ShapeCheck::Check(src_); + } +}; +/*! + * \brief mirroring expression, mirror images in width + * \param src original image batches + * \return expression corresponding to mirrored result + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline MirroringExp::kDim> +mirror(const Exp &src) { + TypeCheckPass::kDim >= 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return MirroringExp::kDim>(src.self()); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const MirroringExp &e) + : src_(MakePlan(e.src_)), width_(e.shape_[srcdim - 1]) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + return src_.Eval(i, width_ - j - 1); + } + + private: + Plan src_; + const index_t width_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_MIRROR_H_ diff --git a/mshadow/extension/pack_col2patch.h b/mshadow/extension/pack_col2patch.h new file mode 100644 index 000000000000..28001b42c9e6 --- /dev/null +++ b/mshadow/extension/pack_col2patch.h @@ -0,0 +1,119 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file pack_col2patch.h + * \brief support for pack + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_PACK_COL2PATCH_H_ +#define MSHADOW_EXTENSION_PACK_COL2PATCH_H_ +#include +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief reverse operation of UnpackPatchToCol, + * used to backprop gradient back + * this is a version supporting multiple images + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam dstdim destination dimension + */ +template +struct PackColToPatchXExp: + public MakeTensorExp, + SrcExp, dstdim, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief patch height */ + index_t psize_y_; + /*! \brief patch height */ + index_t psize_x_; + /*! \brief patch stride */ + index_t pstride_; + /*! \brief constructor */ + PackColToPatchXExp(const SrcExp &src, Shape imshape, + index_t psize_y, index_t psize_x, index_t pstride) + :src_(src), psize_y_(psize_y), psize_x_(psize_x), pstride_(pstride){ + this->shape_ = imshape; + const index_t o_height = (imshape[dstdim - 2] - psize_y) / pstride + 1; + const index_t o_width = (imshape[dstdim - 1] - psize_x) / pstride + 1; + Shape<2> sshape = ShapeCheck<2, SrcExp>::Check(src_); + utils::Check(sshape[1] == o_height * o_width * + imshape.ProdShape(0, dstdim - 3), + "PackColToPatchExp: src.size(1) mismatch"); + utils::Check(sshape[0] == psize_y * psize_x * imshape[dstdim - 3], + "PackColToPatchExp: src.size(0) mismatch"); + } +}; +/*! 
+ * \brief reverse operation of pack_col2patch, can be used to implement deconvolution + * \return packed img expression + * \param mat source matrix + * \param imshape shape of target img + * \param psize_y height of each patch + * \param psize_x height of each patch + * \param pstride stride of each patch + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam dstdim destination dimension + * \tparam etype type of expression + */ +template +inline PackColToPatchXExp +pack_col2patch(const expr::Exp &src, + Shape imshape, index_t psize_y, + index_t psize_x, index_t pstride) { + TypeCheckPass::kDim == 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + utils::Check(imshape[dstdim - 1] >= psize_x && + imshape[dstdim - 2] >= psize_y, + "PackColToPatch:image shape smaller than patch size"); + return PackColToPatchXExp(src.self(), imshape, + psize_y, psize_x, pstride); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const PackColToPatchXExp &e) + :src_(MakePlan(e.src_)), psize_y_(e.psize_y_), + psize_x_(e.psize_x_), pstride_(e.pstride_), + i_channel_(e.shape_[dstdim - 3]), i_height_(e.shape_[dstdim - 2]), + o_height_((e.shape_[dstdim - 2] - psize_y_) / pstride_ + 1), + o_width_((e.shape_[dstdim - 1] - psize_x_) / pstride_ + 1) { + // note: i/o convention are same as unpack + } + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + using namespace std; + const index_t y = i % i_height_; + const index_t idivh = i / i_height_; + const index_t c = idivh % i_channel_; + const index_t n = idivh / i_channel_; + const index_t x = j; + const index_t py_min = + y < psize_y_ ? 0 : (y-psize_y_ + pstride_) / pstride_; + const index_t px_min = + x < psize_x_ ? 0 : (x-psize_x_ + pstride_) / pstride_; + const index_t py_max = min((y + pstride_) / pstride_, o_height_); + const index_t px_max = min((x + pstride_) / pstride_, o_width_); + DType res = static_cast(0); + for (index_t py = py_min; py < py_max; ++py) { + for (index_t px = px_min; px < px_max; ++px) { + res += src_.Eval(((c * psize_y_ + y - py*pstride_) * psize_x_ + + x - px * pstride_), + (n * o_height_ + py) * o_width_ + px); + } + } + return res; + } + + private: + Plan src_; + const index_t psize_y_, psize_x_, pstride_, i_channel_; + const index_t i_height_, o_height_, o_width_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_PACK_COL2PATCH_H_ diff --git a/mshadow/extension/pad.h b/mshadow/extension/pad.h new file mode 100644 index 000000000000..6622a022acc8 --- /dev/null +++ b/mshadow/extension/pad.h @@ -0,0 +1,111 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file pad.h + * \brief support for pad + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_PAD_H_ +#define MSHADOW_EXTENSION_PAD_H_ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief padding expression, pad a image with zeros + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam srcdim dimension of src + */ +template +struct PaddingExp: + public MakeTensorExp, + SrcExp, srcdim, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief pad size in y */ + index_t pad_y_; + /*! \brief pad size in x */ + index_t pad_x_; + /*! \brief source tensor height */ + index_t src_height_; + /*! \brief source tensor width */ + index_t src_width_; + /*! 
\brief constructor */ + PaddingExp(const SrcExp &src, index_t pad_y, index_t pad_x) + : src_(src), pad_y_(pad_y), pad_x_(pad_x) { + this->shape_ = ShapeCheck::Check(src_); + src_height_ = this->shape_[srcdim - 2]; + src_width_ = this->shape_[srcdim - 1]; + this->shape_[srcdim - 2] += pad_y * 2; // height + this->shape_[srcdim - 1] += pad_x * 2; // width + } +}; +/*! + * \brief padding expression, pad a image with zeros on boundaries, padding affects shape[0], and shape[1] + * \param src original image batches + * \param pad padding size + * \return expression corresponding to padded result + * \tparam SrcExp source expression + * \tparam DType the content data type + * \tparam etype type of expression + */ +template +inline PaddingExp::kDim> +pad(const Exp &src, index_t pad) { + TypeCheckPass::kDim >= 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return PaddingExp::kDim>(src.self(), pad, pad); +} +/*! + * \brief padding expression, pad a image with zeros on boundaries, padding affects shape[0], and shape[1] + * \param src original image batches + * \param pad_y padding size in y + * \param pad_x padding size in x + * \return expression corresponding to padded result + * \tparam SrcExp source expression + * \tparam DType the content data type + * \tparam etype type of expression + */ +template +inline PaddingExp::kDim> +pad(const Exp &src, index_t pad_y, index_t pad_x) { + TypeCheckPass::kDim >= 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return PaddingExp::kDim> + (src.self(), pad_y, pad_x); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const PaddingExp &e) + : src_(MakePlan(e.src_)), + pad_y_(e.pad_y_), pad_x_(e.pad_x_), + new_height_(e.shape_[srcdim - 2]), + src_height_(e.src_height_), src_width_(e.src_width_) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + const index_t x = j; + const index_t y = i % new_height_; + const index_t c = i / new_height_; + if (y < pad_y_ || x < pad_x_) return static_cast(0); + const index_t h = y - pad_y_; + const index_t w = x - pad_x_; + if (h < src_height_ && w < src_width_) { + return src_.Eval(c * src_height_ + h, w); + } else { + return static_cast(0); + } + } + + private: + Plan src_; + const index_t pad_y_; + const index_t pad_x_; + const index_t new_height_; + const index_t src_height_; + const index_t src_width_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_PAD_H_ diff --git a/mshadow/extension/reduceto1d.h b/mshadow/extension/reduceto1d.h new file mode 100644 index 000000000000..b35e88c3153f --- /dev/null +++ b/mshadow/extension/reduceto1d.h @@ -0,0 +1,89 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file reduceto1d.h + * \brief support for sum_rows and sumall_except_dim + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_REDUCETO1D_H_ +#define MSHADOW_EXTENSION_REDUCETO1D_H_ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief reduction to 1 dimension tensor + * input: Tensor: ishape + * output: Tensor shape[0] = ishape[dimkeep]; + * + * \tparam SrcExp type of expression to be reduced + * \tparam DType the data type of the scalar + * \tparam Reducer which reducer to use + * \tparam m_dimkeep which dimension to be kept, encoded with dimsrc - dimkeep + */ +template +struct ReduceTo1DExp: + public Exp, + DType, type::kComplex> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief source operand, scale of the */ + DType scale_; + /*! 
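Sketch combining `pad`, `crop` and `mirror` from the extensions above on an image batch with layout (batch, channel, y, x). The shapes and the pad-then-crop round trip are illustrative.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void pad_then_crop(Tensor<cpu, 4, float> out,
                   Tensor<cpu, 4, float> img,
                   index_t pad_size) {
  // zero-pad the border, then cut a window of the original size from the
  // center; with equal pad on both sides this restores the input shape
  out = crop(pad(img, pad_size), Shape2(img.size(2), img.size(3)));
}

void flip_horizontal(Tensor<cpu, 4, float> out, Tensor<cpu, 4, float> img) {
  // mirror flips the last (width) dimension
  out = mirror(img);
}
```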
\brief construct a repmat expression from src and nrow */ + ReduceTo1DExp(const SrcExp& src, DType scale) : src_(src), scale_(scale) {} +}; +/*! + * \brief a sum over all dimensions, except dimkeep + * \param exp input expression that must be a matrix Tensor + * \return a expresion with type Tensor + * \tparam dimkeep the dimension that will be kept + * \tparam SrcExp expression + * \tparam etype type of expression + */ +template +inline ReduceTo1DExp::kDim - dimkeep> +sumall_except_dim(const Exp &exp) { + return ReduceTo1DExp::kDim - dimkeep>(exp.self(), 1); +} +/*! + * \brief a expression that sum over rows of a matrix + * \param exp input expression that must be a matrix Tensor + * \return a expresion with type Tensor + * \tparam SrcExp expression + * \tparam etype type of expression + */ +template +inline ReduceTo1DExp +sum_rows(const Exp &exp) { + TypeCheckPass::kDim ==2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return sumall_except_dim<1>(exp); +} +template +struct ExpComplexEngine, + ReduceTo1DExp, + DType> { + static const int dimkeep = ExpInfo::kDim - m_dimkeep; + inline static void Eval(Tensor *dst, + const ReduceTo1DExp &exp) { + TypeCheckPass + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + MapReduceKeepHighDim(dst, exp.src_, exp.scale_); + } +}; +template +struct ExpComplexEngine, + ReduceTo1DExp, DType> { + inline static void Eval(Tensor *dst, + const ReduceTo1DExp &exp) { + MapReduceKeepLowest(dst, exp.src_, exp.scale_); + } +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_REDUCETO1D_H_ diff --git a/mshadow/extension/reshape.h b/mshadow/extension/reshape.h new file mode 100644 index 000000000000..738e98f0e2c9 --- /dev/null +++ b/mshadow/extension/reshape.h @@ -0,0 +1,87 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file reshape.h + * \brief support for reshape + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_RESHAPE_H_ +#define MSHADOW_EXTENSION_RESHAPE_H_ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief reshape the content to another shape + * input: Tensor: ishape + * output: Tensor ishape.Size() == oshape.Size() + * \tparam SrcExp source expression + * \tparam dimdst target dimension + * \tparam dimsrc source dimension + */ +template +struct ReshapeExp: + public MakeTensorExp, + SrcExp, dimdst, DType> { + /*! \brief source expression */ + const SrcExp &src_; + /*! \brief smallest dimension of input */ + index_t ishapex_; + /*! \brief constructor */ + ReshapeExp(const SrcExp &src, Shape shape) + : src_(src) { + Shape ishape = ShapeCheck::Check(src_); + utils::Check(ishape.Size() == shape.Size(), "reshape size must match"); + ishapex_ = ishape[dimsrc - 1]; + this->shape_ = shape; + } +}; +/*! 
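Sketch of the two reductions defined above, in the bias-gradient role the comments describe. The layer shapes are illustrative; the scaled accumulation at the end relies on the `ReduceTo1DExp` scaling overload defined earlier in this patch.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void bias_gradients(Tensor<cpu, 1, float> gbias_fc,
                    Tensor<cpu, 2, float> grad_fc,
                    Tensor<cpu, 1, float> gbias_conv,
                    Tensor<cpu, 4, float> grad_conv) {
  // fully connected layer: sum over rows, keep the column dimension
  gbias_fc = sum_rows(grad_fc);
  // convolution layer: keep the channel dimension (dim 1), sum everything else
  gbias_conv = sumall_except_dim<1>(grad_conv);
  // the reduction is itself an expression, so it can be scaled and accumulated
  gbias_conv += 0.5f * sumall_except_dim<1>(grad_conv);
}
```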
+ * \brief a expression that reshapes a tensor to another shape + * \param src Tensor: + * \param oshape target shape + * \return a expresion with type Tensor + * \tparam SrcExp source expression + * \tparam etype source expression type + * \tparam dimdst target dimension + */ +template +inline ReshapeExp::kDim> +reshape(const Exp &src, Shape oshape) { + return ReshapeExp::kDim> + (src.self(), oshape); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const ReshapeExp &e) + : src_(MakePlan(e.src_)), + oshapex_(e.shape_[dimdst - 1]), ishapex_(e.ishapex_) {} + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + const index_t idx = y * oshapex_ + x; + return src_.Eval(idx / ishapex_, idx % ishapex_); + } + + private: + Plan src_; + const index_t oshapex_, ishapex_; +}; +// special work plan for 1 dimensional data +template +struct Plan, DType> { + public: + explicit Plan(const ReshapeExp &e) + : src_(MakePlan(e.src_)), oshapex_(e.shape_[dimdst - 1]) { + } + MSHADOW_XINLINE DType Eval(index_t y, index_t x) const { + return src_.Eval(0, y * oshapex_ + x); + } + + private: + Plan src_; + const index_t oshapex_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_RESHAPE_H_ diff --git a/mshadow/extension/spatial_pool.h b/mshadow/extension/spatial_pool.h new file mode 100644 index 000000000000..07f8433cca27 --- /dev/null +++ b/mshadow/extension/spatial_pool.h @@ -0,0 +1,146 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file spatial_pool.h + * \brief support for spatial pooling + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_SPATIAL_POOL_H_ +#define MSHADOW_EXTENSION_SPATIAL_POOL_H_ +#include +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief pooling expression, do reduction over local patches of a image + * \tparam Reducer reduction method during pooling + * \tparam SrcExp source expression to be pooled from + * \tparam DType the content data type + * \tparam srcdim dimension of src + */ +template +struct PoolingExp: + public MakeTensorExp, + SrcExp, srcdim, DType> { + /*! \brief source operand */ + const SrcExp &src_; + /*! \brief kernel size in height */ + index_t ksize_y_; + /*! \brief kernel size in width */ + index_t ksize_x_; + /*! \brief kernel stride */ + index_t kstride_; + /*! \brief source height shape[1] */ + index_t src_height_; + /*! \brief source width shape[0] */ + index_t src_width_; + /*! \brief constructor */ + PoolingExp(const SrcExp &src, + index_t ksize_y, index_t ksize_x, index_t kstride) + : src_(src), ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { + Shape sshape = ShapeCheck::Check(src_); + utils::Check(sshape[srcdim - 1] >= ksize_x && sshape[srcdim - 2] >= ksize_y, + "PoolingExp: kernel must be smaller than image"); + this->src_height_ = sshape[srcdim - 2]; + this->src_width_ = sshape[srcdim - 1]; + this->shape_ = sshape; + this->shape_[srcdim - 2] = (src_height_ - ksize_y) / kstride + 1; + this->shape_[srcdim - 1] = (src_width_ - ksize_x) / kstride + 1; + } + /*! 
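Sketch of `reshape` used to flatten a 4D activation into a 2D matrix, for example before a fully connected layer. The element count of the target shape must match the source, as checked in the constructor above.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void flatten(Tensor<cpu, 2, float> out, Tensor<cpu, 4, float> in) {
  const index_t nbatch = in.size(0);
  const index_t rest = in.size(1) * in.size(2) * in.size(3);
  // reinterpret (nbatch, c, h, w) as (nbatch, c * h * w)
  out = reshape(in, Shape2(nbatch, rest));
}
```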
\brief constructor, specify shape */ + PoolingExp(const SrcExp &src, Shape<2> pshape, + index_t ksize_y, index_t ksize_x, index_t kstride) + : src_(src), ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { + Shape sshape = ShapeCheck::Check(src_); + utils::Check(sshape[srcdim - 1] >= ksize_x && + sshape[srcdim - 2] >= ksize_y, + "PoolingExp: kernel must be smaller than image"); + this->src_height_ = sshape[srcdim - 2]; + this->src_width_ = sshape[srcdim - 1]; + this->shape_ = sshape; + this->shape_[srcdim - 2] = pshape[0]; + this->shape_[srcdim - 1] = pshape[1]; + } +}; +/*! + * \brief pooling subregion results together + * \param src source image, shape: (batch, channel, height, width) + * \param ksize_y kernel size in height + * \param ksize_x kernel size in width + * \param kstride stride for each kernel + * \return expression of pooled result + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam DType the content data type + * \tparam etype type of expression + */ +template +inline PoolingExp::kDim> +pool(const Exp &src, + index_t ksize_y, index_t ksize_x, index_t kstride) { + TypeCheckPass::kDim >= 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return PoolingExp::kDim> + (src.self(), ksize_y, ksize_x, kstride); +} +/*! + * \brief same as pool, except the output shape is specified by pshape + * \param src source image + * \param pshape ouput shape + * \param ksize_y kernel size in y + * \param ksize_x kernel size in x + * \param kstride stride for each kernel + * \return expression of pooled result + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam DType the content data type + * \tparam etype type of expression + */ +template +inline PoolingExp::kDim> +pool(const Exp &src, Shape<2> pshape, + index_t ksize_y, index_t ksize_x, index_t kstride) { + TypeCheckPass::kDim >= 2> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return PoolingExp::kDim> + (src.self(), pshape, ksize_y, ksize_x, kstride); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const PoolingExp &e) + : src_(MakePlan(e.src_)), + ksize_y_(e.ksize_y_), ksize_x_(e.ksize_x_), kstride_(e.kstride_), + src_height_(e.src_height_), src_width_(e.src_width_), + new_height_(e.shape_[srcdim - 2]) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + using namespace std; + const index_t py = i % new_height_; + const index_t y_start = py * kstride_; + const index_t y_end = min(y_start + ksize_y_, src_height_); + const index_t px = j; + const index_t x_start = px * kstride_; + const index_t x_end = min(x_start + ksize_x_, src_width_); + const index_t c = i / new_height_; + + DType res; Reducer::SetInitValue(res); + for (index_t y = y_start; y < y_end; ++y) { + for (index_t x = x_start; x < x_end; ++x) { + Reducer::Reduce(res, src_.Eval(c * src_height_ + y, x)); + } + } + return res; + } + + private: + Plan src_; + const index_t ksize_y_, ksize_x_, kstride_; + const index_t src_height_, src_width_; + const index_t new_height_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_SPATIAL_POOL_H_ diff --git a/mshadow/extension/spatial_unpool.h b/mshadow/extension/spatial_unpool.h new file mode 100644 index 000000000000..848b77bb39fa --- /dev/null +++ b/mshadow/extension/spatial_unpool.h @@ -0,0 +1,130 @@ +/*! 
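Sketch of max pooling with the `pool` expression above and the built-in `red::maximum` reducer; the 2x2 kernel and the shapes are illustrative.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void max_pool_forward(Tensor<cpu, 4, float> pooled,
                      Tensor<cpu, 4, float> img) {
  const index_t ksize = 2, kstride = 2;
  // output spatial size is (in - ksize) / kstride + 1 in each dimension
  pooled = pool<red::maximum>(img, ksize, ksize, kstride);
  // alternatively, force a particular output shape with the second overload
  pooled = pool<red::maximum>(img, Shape2(pooled.size(2), pooled.size(3)),
                              ksize, ksize, kstride);
}
```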
+ * Copyright (c) 2014 by Contributors + * \file spatial_unpool.h + * \brief support for unpool + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_SPATIAL_UNPOOL_H_ +#define MSHADOW_EXTENSION_SPATIAL_UNPOOL_H_ +#include +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief unpooling expr reverse operation of pooling, used to pass gradient back + * \tparam Reducer reduction method during pooling + * \tparam SrcExp source expression to be pooled from + * \tparam DType the content data type + * \tparam srcdim dimension of src + */ +template +struct UnPoolingExp: + public MakeTensorExp, + SrcExp, srcdim, DType> { + /*! \brief source input, corresponds to src in pooling */ + const SrcExp &data_src_; + /*! \brief result of pooled data, corresponds to result of pooling */ + const SrcExp &data_pooled_; + /*! \brief gradient data of pooled part, to be propgate down */ + const SrcExp &grad_pooled_; + /*! \brief shape of pooled expression */ + index_t pshape_y_; + /*! \brief shape of pooled expression */ + index_t pshape_x_; + /*! \brief kernel size in height */ + index_t ksize_y_; + /*! \brief kernel size in width */ + index_t ksize_x_; + /*! \brief kernel stride */ + index_t kstride_; + /*! \brief constructor */ + UnPoolingExp(const SrcExp &data_src, + const SrcExp &data_pooled, + const SrcExp &grad_pooled, + index_t ksize_y, index_t ksize_x, index_t kstride) + : data_src_(data_src), data_pooled_(data_pooled), + grad_pooled_(grad_pooled), + ksize_y_(ksize_y), ksize_x_(ksize_x), kstride_(kstride) { + Shape pshape = ShapeCheck::Check(grad_pooled); + utils::Check(pshape == ShapeCheck::Check(data_pooled), + "UnPoolingExp: pooled shape mismatch"); + Shape sshape = ShapeCheck::Check(data_src); + for (int k = 0; k < srcdim - 2; ++k) { + utils::Check(pshape[k] == sshape[k], + "UnPoolingExp: pool and src shape mismatch"); + } + pshape_x_ = pshape[srcdim - 1]; + pshape_y_ = pshape[srcdim - 2]; + this->shape_ = sshape; + } +}; +/*! 
+ * \brief unpooling gradient for 4D, backprop gradient value back, revserse operation of pooling, + * same as unpooling, but allows unequal size of kernel + * \param data_src source input, corresponds to src in pooling + * \param data_pooled result of pooled data, corresponds to result of pooling + * \param grad_pooled gradient data of pooled part, to be propgate down + * \param ksize_y kernel height + * \param ksize_x kernel width + * \param kstride stride for each kernel + * \return expression corresponding to unpooled 4D Tensor, storing backproped gradient + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam DType the content data type + * \tparam etype type of expression + */ +template +inline UnPoolingExp::kDim> +unpool(const Exp &data_src, + const Exp &data_pooled, + const Exp &grad_pooled, + index_t ksize_y, index_t ksize_x, index_t kstride) { + return UnPoolingExp::kDim> + (data_src.self(), data_pooled.self(), grad_pooled.self(), + ksize_y, ksize_x, kstride); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const UnPoolingExp &e) + : data_src_(e.data_src_), data_pooled_(e.data_pooled_), + grad_pooled_(e.grad_pooled_), sshape_y_(e.shape_[srcdim - 2]), + pshape_y_(e.pshape_y_), pshape_x_(e.pshape_x_), + ksize_y_(e.ksize_y_), ksize_x_(e.ksize_x_), kstride_(e.kstride_) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + using namespace std; + const index_t x = j; + const index_t y = i % sshape_y_; + const index_t c = i / sshape_y_; + const DType vsrc = data_src_.Eval(i, j); + const index_t py_min = + y < ksize_y_ ? 0 : (y - ksize_y_ + kstride_) / kstride_; + const index_t px_min = + x < ksize_x_ ? 0 : (x - ksize_x_ + kstride_) / kstride_; + const index_t py_max = min((y + kstride_) / kstride_, pshape_y_); + const index_t px_max = min((x + kstride_) / kstride_, pshape_x_); + + DType val = static_cast(0); + for (index_t py = py_min; py < py_max; ++py) { + for (index_t px = px_min; px < px_max; ++px) { + val += Reducer::PartialGrad(vsrc, + data_pooled_.Eval(c * pshape_y_ + py, px)) * + grad_pooled_.Eval(c * pshape_y_ + py, px); + } + } + + return val; + } + + private: + Plan data_src_, data_pooled_, grad_pooled_; + const index_t sshape_y_, pshape_y_, pshape_x_; + const index_t ksize_y_, ksize_x_; + const index_t kstride_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_SPATIAL_UNPOOL_H_ diff --git a/mshadow/extension/swapaxis.h b/mshadow/extension/swapaxis.h new file mode 100644 index 000000000000..3fcda22b527e --- /dev/null +++ b/mshadow/extension/swapaxis.h @@ -0,0 +1,109 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file swapaxis.h + * \brief support for swapaxis + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_SWAPAXIS_H_ +#define MSHADOW_EXTENSION_SWAPAXIS_H_ +#include +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief swap two axis of a tensor + * input: Tensor: ishape + * output: Tensor oshape[a1],oshape[a2] = ishape[a2],oshape[a1] + * + * \tparam SrcExp type of source expression + * \tparam DType the type of elements + * \tparam dimsrc source dimension, assert a1 > a2 + * \tparam m_a1 one dimension to be swapped, encoded by dimsrc - a1 + * \tparam a2 second dimension to be swapped, encoded by a2 + */ +template +struct SwapAxisExp: + public MakeTensorExp, + SrcExp, dimsrc, DType> { + // decode the a1, a2 + static const int a1 = dimsrc - m_a1; + /*! 
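Sketch of the matching backward pass using `unpool`; the three inputs mirror the forward pooling call (input image, pooled output, incoming gradient), and the kernel settings must match the forward pass.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void max_pool_backward(Tensor<cpu, 4, float> grad_in,
                       Tensor<cpu, 4, float> img,
                       Tensor<cpu, 4, float> pooled,
                       Tensor<cpu, 4, float> grad_pooled) {
  const index_t ksize = 2, kstride = 2;
  // routes each pooled gradient back to the input positions selected by
  // red::maximum during the forward pass
  grad_in = unpool<red::maximum>(img, pooled, grad_pooled,
                                 ksize, ksize, kstride);
}
```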
\brief source expression */ + const SrcExp &src_; + /*! \brief constructor */ + explicit SwapAxisExp(const SrcExp &src) : src_(src) { + this->shape_ = ShapeCheck::Check(src); + std::swap(this->shape_[a1], this->shape_[a2]); + } +}; +/*! + * \brief a expression that reshapes a tensor to another shape + * \param src Tensor: + * \return a expresion with type Tensor + * \tparam a1 higher dimension to be swapped, assert a1 > a2 + * \tparam a2 lower dimension to be swapped + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype source expression type + */ +template +inline SwapAxisExp::kDim, + ExpInfo::kDim - a1, a2> +swapaxis(const Exp &src) { + typedef ExpInfo Info; + TypeCheckPass= a1 + 1 && Info::kDim >= a2 + 1 && + a2 < a1>::Error_Expression_Does_Not_Meet_Dimension_Req(); + return SwapAxisExp::kDim, + ExpInfo::kDim - a1, a2>(src.self()); +} +template +struct Plan, DType> { + public: + // decode the a1 + static const int a1 = dimsrc - m_a1; + explicit Plan(const SwapAxisExp &e) + : src_(MakePlan(e.src_)), + shapey_(e.shape_.ProdShape(a1 + 1, dimsrc - 1)), + shapez_(e.shape_[a1]), + shapec_(e.shape_.ProdShape(a2 + 1, a1)), + shapen_(e.shape_[a2]) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + const index_t y = i % shapey_; + i /= shapey_; + const index_t z = i % shapez_; + i /= shapez_; + const index_t c = i % shapec_; + i /= shapec_; + const index_t n = i % shapen_; + // swap z and n + return src_.Eval(((((i / shapen_) * shapez_ + z) * shapec_ + + c) * shapen_ + n) * shapey_ + y, j); + } + + private: + Plan src_; + const index_t shapey_, shapez_, shapec_, shapen_; +}; +template +struct Plan, DType> { + public: + explicit Plan(const SwapAxisExp &e) + : src_(MakePlan(e.src_)), + shapex_(e.shape_[dimsrc - 1]), + shapey_(e.shape_.ProdShape(a2 + 1, dimsrc - 1)), + shapez_(e.shape_[a2]) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t x) const { + // swap x and z + const index_t y = i % shapey_; + i /= shapey_; + const index_t z = i % shapez_; + const index_t n = i / shapez_; + return src_.Eval((n * shapex_ + x) * shapey_ + y , z); + } + + private: + Plan src_; + const index_t shapex_, shapey_, shapez_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_SWAPAXIS_H_ diff --git a/mshadow/extension/unpack_patch2col.h b/mshadow/extension/unpack_patch2col.h new file mode 100644 index 000000000000..619baf26bd2a --- /dev/null +++ b/mshadow/extension/unpack_patch2col.h @@ -0,0 +1,123 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file unpack_patch2col.h + * \brief support for unpack + * \author Tianqi Chen + */ +#ifndef MSHADOW_EXTENSION_UNPACK_PATCH2COL_H_ +#define MSHADOW_EXTENSION_UNPACK_PATCH2COL_H_ +#include "../extension.h" +namespace mshadow { +namespace expr { +/*! + * \brief unpack local (overlap) patches of image to column of mat, + * can be used to implement convolution, this expression allow unpack of a batch + * this is a version support unpacking multiple images + * after getting unpacked mat, we can use: output = dot(weight, mat) to get covolved results, the relations: + * \tparam SrcExp source expression + * \tparam dstdim destination dimension + */ +template +struct UnpackPatchToColXExp: + public MakeTensorExp, + SrcExp, 2, DType>{ + /*! \brief source operand */ + const SrcExp &img_; + /*! \brief patch height */ + index_t psize_y_; + /*! \brief patch width */ + index_t psize_x_; + /*! \brief patch stride */ + index_t pstride_; + /*! \brief number of input channel */ + index_t i_channel_; + /*! 
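A sketch of the convolution pattern described by the unpack comment (unpack patches to columns, multiply by the filter matrix, regroup the result), together with the reverse `pack_col2patch` step for the data gradient. It assumes a BLAS backend for `dot()`, assumes the temporaries are pre-allocated with the sizes noted in the comments, and uses `swapaxis` from the header above to reorder the regrouped output; the function and variable names are illustrative.

```
#include "mshadow/tensor.h"
using namespace mshadow;
using namespace mshadow::expr;

void conv_forward(Tensor<cpu, 4, float> out,     // (num, out_channel, o_h, o_w)
                  Tensor<cpu, 4, float> img,     // (num, in_channel, height, width)
                  Tensor<cpu, 2, float> weight,  // (out_channel, in_channel*ky*kx)
                  Tensor<cpu, 2, float> tmp_col, // (in_channel*ky*kx, o_h*o_w*num)
                  Tensor<cpu, 2, float> tmp_dst, // (out_channel, o_h*o_w*num)
                  index_t ksize_y, index_t ksize_x, index_t kstride) {
  tmp_col = unpack_patch2col(img, ksize_y, ksize_x, kstride);
  tmp_dst = dot(weight, tmp_col);
  // columns are grouped image by image, so regroup and swap the first two axes
  out = swapaxis<1, 0>(reshape(tmp_dst, Shape4(out.size(1), out.size(0),
                                               out.size(2), out.size(3))));
}

void conv_backward_data(Tensor<cpu, 4, float> grad_in,
                        Tensor<cpu, 2, float> weight,
                        Tensor<cpu, 2, float> grad_dst, // (out_channel, o_h*o_w*num)
                        Tensor<cpu, 2, float> tmp_col,  // (in_channel*ky*kx, o_h*o_w*num)
                        index_t ksize_y, index_t ksize_x, index_t kstride) {
  tmp_col = dot(weight.T(), grad_dst);
  // reverse of unpack_patch2col: accumulate columns back into overlapping patches
  grad_in = pack_col2patch(tmp_col, grad_in.shape_, ksize_y, ksize_x, kstride);
}
```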
\brief height of img */ + index_t i_height_; + /*! \brief width of img */ + index_t i_width_; + /*! \brief constructor */ + UnpackPatchToColXExp(const SrcExp &img, + index_t psize_y, + index_t psize_x, + index_t pstride) + : img_(img), psize_y_(psize_y), + psize_x_(psize_x), pstride_(pstride) { + Shape imshape = ShapeCheck::Check(img_); + utils::Check(imshape[srcdim - 1] >= psize_x && + imshape[srcdim - 2] >= psize_y, + "UnpackPatchToCol:image shape smaller than patch size"); + this->i_channel_ = imshape[srcdim - 3]; + this->i_height_ = imshape[srcdim - 2]; + this->i_width_ = imshape[srcdim - 1]; + // calculate number of batches + const index_t num = imshape.ProdShape(0, srcdim - 3); + const index_t o_height = (i_height_ - psize_y) / pstride + 1; + const index_t o_width = (i_width_ - psize_x) / pstride + 1; + this->shape_[1] = o_height * o_width * num; + this->shape_[0] = psize_y * psize_x * i_channel_; + } +}; + +/*! + * \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution + * after getting unpacked mat, we can use: output = dot(weight, mat) to get covolved results, the relations: + * + * weight; shape[0]: out_channel, shape[1]: ichannel * psize_y * psize_x + * output; shape[0]: out_channel, shape[1]: out_height * out_width * num_of_images + * out_height = (in_height - psize_y) / pstride + 1, this means we pad inperfect patch with 0 + * out_width = (in_width - psize_x) / pstride + 1 + * + * \return mat target matrix; shape[0]: in_channel*psize_y*psize_x shape[1]: out_height*out_width * num_of_images + * \param img source image; shape[-3]: in_channels, shape[-2]: in_height, shape[-1]: in_width, can be 3D or 4D tensor(multiple images) + * \param psize_y height of each patch + * \param psize_x width of each patch + * \param pstride stride of each patch + * \tparam SrcExp source expression + * \tparam DType the type of elements + * \tparam etype type of expression + */ +template +inline UnpackPatchToColXExp::kDim> +unpack_patch2col(const Exp &img, + index_t psize_y, index_t psize_x, index_t pstride) { + TypeCheckPass::kDim >= 3> + ::Error_Expression_Does_Not_Meet_Dimension_Req(); + return UnpackPatchToColXExp::kDim> + (img.self(), psize_y, psize_x, pstride); +} +//---------------------- +// Execution plan +//---------------------- +template +struct Plan, DType> { + public: + explicit Plan(const UnpackPatchToColXExp &e) + :src_(MakePlan(e.img_)), + psize_y_(e.psize_y_), psize_x_(e.psize_x_), pstride_(e.pstride_), + i_channel_(e.i_channel_), i_height_(e.i_height_), i_width_(e.i_width_), + o_height_((i_height_ - psize_y_) / pstride_ + 1), + o_width_((i_width_ - psize_x_) / pstride_ + 1) {} + MSHADOW_XINLINE DType Eval(index_t i, index_t j) const { + const index_t x_offset = i % psize_x_; + const index_t idivp = i / psize_x_; + const index_t y_offset = idivp % psize_y_; + const index_t c = idivp / psize_y_; + const index_t x = (j % o_width_) * pstride_ + x_offset; + const index_t jdivw = j / o_width_; + const index_t y = (jdivw % o_height_) * pstride_ + y_offset; + const index_t n = jdivw / o_height_; + if (x < i_width_ && y < i_height_) { + return src_.Eval((n * i_channel_ + c) * i_height_ + y, x); + } else { + return 0.0f; + } + } + + private: + Plan src_; + const index_t psize_y_, psize_x_, pstride_, i_channel_; + const index_t i_height_, i_width_, o_height_, o_width_; +}; +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_EXTENSION_UNPACK_PATCH2COL_H_ diff --git a/mshadow/io.h b/mshadow/io.h new file mode 100644 index 
000000000000..5a298198123e --- /dev/null +++ b/mshadow/io.h @@ -0,0 +1,122 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file io.h + * \brief definitions of I/O functions for mshadow tensor + * \author Tianqi Chen + */ +#ifndef MSHADOW_IO_H_ +#define MSHADOW_IO_H_ +#include "./tensor.h" + +namespace mshadow { +namespace utils { +/*! + * \brief interface of stream I/O, used to serialize data, + * mshadow does not restricted to only this interface in SaveBinary/LoadBinary + * mshadow accept all class that implements Read and Write + */ +class IStream { + public: + /*! + * \brief read data from stream + * \param ptr pointer to memory buffer + * \param size size of block + * \return usually is the size of data readed + */ + virtual size_t Read(void *ptr, size_t size) = 0; + /*! + * \brief write data to stream + * \param ptr pointer to memory buffer + * \param size size of block + */ + virtual void Write(const void *ptr, size_t size) = 0; + /*! \brief virtual destructor */ + virtual ~IStream(void) {} +}; +} // namespace utils +/*! + * \brief CPU/GPU: save a tensor by binary format, for GPU version, a temp Tensor storage will be allocated + * \param fo output binary stream + * \param src source data file + * \tparam dim dimension of tensor + * \tparam DType type of element in tensor + * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. + */ +template +inline void SaveBinary(TStream &fo, const Tensor &src); +/*! \brief refer to comment of cpu ver \sa SaveBinary */ +template +inline void SaveBinary(TStream &fo, const Tensor &src); +/*! + * \brief CPU/GPU: load a tensor by binary format, for GPU version, a temp Tensor storage will be allocated + * if pre_alloc is true , then space in dst is preallocated, and must have same shape of the tensor loaded + * if pre_alloc is false, then dst originally does not have space allocated, LoadBinary will allocate space for dst + * \param fi output binary stream + * \param dst destination file + * \param pre_alloc whether space is pre-allocated, if false, space allocation will happen + * \tparam dim dimension of tensor + * \tparam DType type of element in tensor + * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. + */ +template +inline void LoadBinary(TStream &fi, + Tensor *dst, bool pre_alloc); +/*! 
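
Because SaveBinary/LoadBinary accept any class with matching Read and Write methods, a stdio-backed stream is enough. The FileStream class below is a hypothetical sketch, not part of the library.

```
#include <cstdio>
// hypothetical stdio-backed stream satisfying the Read/Write interface above
class FileStream : public mshadow::utils::IStream {
 public:
  explicit FileStream(FILE *fp) : fp_(fp) {}
  virtual size_t Read(void *ptr, size_t size) {
    return fread(ptr, 1, size, fp_);
  }
  virtual void Write(const void *ptr, size_t size) {
    fwrite(ptr, 1, size, fp_);
  }
 private:
  FILE *fp_;  // not owned; the caller opens and closes the file
};
```
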
\brief refer to comment of cpu ver \sa LoadBinary */ +template +inline void LoadBinary(TStream &fi, + Tensor *dst, bool pre_alloc); + +// implementations +template +inline void SaveBinary(TStream &fo, const Tensor &src_) { + fo.Write(&src_.shape_, sizeof(src_.shape_)); + Tensor src = src_.FlatTo2D(); + for (index_t i = 0; i < src.size(0); ++i) { + fo.Write(src[i].dptr_, sizeof(DType) * src.size(1)); + } +} +template +inline void SaveBinary(TStream &fo, const Tensor &src) { + // copy to CPU, then save + Tensor tmp(src.shape_); + AllocSpace(&tmp); + Stream stream; + Copy(tmp, src, &stream); + SaveBinary(fo, tmp); + FreeSpace(&tmp); +} +template +inline void LoadBinary(TStream &fi, + Tensor *dst_, bool pre_alloc) { + Shape shape; + utils::Check(fi.Read(&shape, sizeof(shape)) != 0, "mshadow::LoadBinary"); + if (pre_alloc) { + utils::Check(shape == dst_->shape_, + "LoadBinary, shape do not match pre-allocated shape"); + } else { + dst_->shape_ = shape; AllocSpace(dst_); + } + Tensor dst = dst_->FlatTo2D(); + if (dst.size(0) == 0) return; + for (index_t i = 0; i < dst.size(0); ++i) { + utils::Check(fi.Read(dst[i].dptr_, sizeof(DType) * dst.size(1)) != 0, + "mshadow::LoadBinary"); + } +} +template +inline void LoadBinary(TStream &fi, + Tensor *dst, bool pre_alloc) { + Tensor tmp; + LoadBinary(fi, &tmp, false); + if (pre_alloc) { + utils::Check(tmp.shape == dst->shape_, + "LoadBinary, shape do not match pre-allocated shape"); + } else { + dst->shape = tmp.shape; AllocSpace(dst); + } + Stream stream; + Copy(*dst, tmp, &stream); + FreeSpace(&tmp); +} +} // namespace mshadow +#endif // MSHADOW_IO_H_ diff --git a/mshadow/random.h b/mshadow/random.h new file mode 100644 index 000000000000..5213a69571f6 --- /dev/null +++ b/mshadow/random.h @@ -0,0 +1,358 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file random.h + * \brief Random inline functions for tensor. + * \author Bing Xu, Tianqi Chen + * Based on curand|MKL|stdlib + */ +#ifndef MSHADOW_RANDOM_H_ +#define MSHADOW_RANDOM_H_ +#include +#include "./tensor.h" +#include "./tensor_container.h" + +namespace mshadow { +/*! + * \brief random number generator + * \tparam Device the device of random number generator + * \tparam DType the target data type of random number can be float for double + */ +template +class Random {}; + +/*! \brief CPU random number generator */ +template +class Random { + public: + /*! + * \brief constructor of random engine + * \param seed random number seed + */ + explicit Random(int seed) { + this->Seed(seed); + buffer_.Resize(Shape1(kRandBufferSize)); + } + ~Random(void) { +#if MSHADOW_USE_MKL + vslDeleteStream(&vStream_); +#endif + } + /*! + * \brief seed random number generator using this seed + * \param seed seed of prng + */ + inline void Seed(int seed) { +#if MSHADOW_USE_MKL + int status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed); + utils::Check(status == VSL_STATUS_OK, + "MKL VSL Random engine failed to be initialized.\n"); +#else + this->rseed_ = static_cast(seed); +#endif + } + /*! + * \brief set the stream of computation + * \param stream computation stream + */ + inline void set_stream(Stream *stream) { + } + /*! 
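
A save/load round trip using the implementations above together with the hypothetical FileStream sketch; the file name and shapes are illustrative, `using namespace mshadow;` assumed.

```
Tensor<cpu, 2, float> W = NewTensor<cpu, float>(Shape2(10, 20), 0.5f);
FILE *fo = fopen("weight.bin", "wb");
FileStream so(fo);
SaveBinary(so, W);            // writes the shape followed by the rows
fclose(fo);

Tensor<cpu, 2, float> W2;     // no space allocated yet
FILE *fi = fopen("weight.bin", "rb");
FileStream si(fi);
LoadBinary(si, &W2, false);   // pre_alloc = false: LoadBinary allocates W2
fclose(fi);

FreeSpace(&W);
FreeSpace(&W2);
```
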
+ * \brief generate data from uniform [a,b) + * \param dst destination + * \param a lower bound of uniform + * \param b upper bound of uniform + * \tparam dim dimension of tensor + */ + template + inline void SampleUniform(Tensor *dst, + DType a = 0.0f, DType b = 1.0f) { + Tensor mat = dst->FlatTo2D(); + for (index_t i = 0; i < mat.size(0); ++i) { + this->GenUniform(mat[i].dptr_, mat.size(1), a, b); + } + } + /*! + * \brief generate data from standard gaussian + * \param dst destination + * \param mu mean variable + * \param sigma standard deviation + * \tparam dim dimension of tensor + */ + template + inline void SampleGaussian(Tensor *dst, + DType mu = 0.0f, DType sigma = 1.0f) { + if (sigma <= 0.0f) { + *dst = mu; return; + } + Tensor mat = dst->FlatTo2D(); + for (index_t i = 0; i < mat.size(0); ++i) { + this->GenGaussian(mat[i].dptr_, mat.size(1), mu, sigma); + } + } + /*! + * \brief return a temporal expression storing standard gaussian random variables + * the temporal tensor is only valid before next call of gaussian or uniform + * can be used as part of expression + * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, + * since second call of gaussian(s2) makes gaussian(s1) invalid + * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression + * \param shape shape of the tensor + * \return a temporal expression storing standard gaussian random variables + * \tparam dim dimension of tensor + */ + template + inline expr::ReshapeExp, DType, dim, 1> + gaussian(Shape shape) { + buffer_.Resize(Shape1(shape.Size())); + this->SampleGaussian(&buffer_, 0.0f, 1.0f); + return expr::reshape(buffer_, shape); + } + /*! + * \brief return a temporal expression storing standard uniform [0,1) + * the temporal tensor is only valid before next call of gaussian or uniform + * can be used as part of expression + * Caution: this means expression such as A = uniform(s1) * uniform(s2) will give invalid result, + * since second call of gaussian(s2) makes gaussian(s1) invalid + * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression + * \param shape shape of the tensor + * \return a temporal expression storing standard uniform [0,1) + * \tparam dim dimension of tensor + */ + template + inline expr::ReshapeExp, DType, dim, 1> + uniform(Shape shape) { + buffer_.Resize(Shape1(shape.Size())); + this->SampleUniform(&buffer_, 0.0f, 1.0f); + return expr::reshape(buffer_, shape); + } + + private: +#if MSHADOW_USE_MKL + /*! 
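
The caution above matters in practice because gaussian() and uniform() return views over one internal buffer. A sketch of safe versus unsafe use; shapes are illustrative, `using namespace mshadow;` assumed.

```
Random<cpu, float> rnd(0);
Tensor<cpu, 2, float> A = NewTensor<cpu, float>(Shape2(4, 5), 0.0f);
Tensor<cpu, 2, float> B = NewTensor<cpu, float>(Shape2(4, 5), 0.0f);
rnd.SampleGaussian(&B, 0.0f, 0.1f);         // fills B in place
A = rnd.gaussian(A.shape_) * 2.0f + B;      // fine: one random expression per statement
// A = rnd.gaussian(A.shape_) * rnd.gaussian(A.shape_);   // wrong: both terms
//                                                        // alias the same buffer
FreeSpace(&A);
FreeSpace(&B);
```
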
\brief stream used by MKL VSL */ + VSLStreamStatePtr vStream_; + // generate uniform distribution + inline void GenUniform(float *dptr, index_t size, float a, float b) { + int status = vsRngUniform(0, vStream_, size, dptr, a, b); + utils::Check(status == VSL_STATUS_OK, + "Failed to generate random number by MKL."); + } + inline void GenUniform(double *dptr, index_t size, double a, double b) { + int status = vdRngUniform(0, vStream_, size, dptr, a, b); + utils::Check(status == VSL_STATUS_OK, + "Failed to generate random number by MKL."); + } + inline void GenGaussian(float *dptr, index_t size, float mu, float sigma) { + int status = vsRngGaussian(0, vStream_, size, dptr, mu, sigma); + utils::Check(status == VSL_STATUS_OK, + "Failed to generate random number by MKL."); + } + inline void GenGaussian(double *dptr, index_t size, double mu, double sigma) { + int status = vdRngGaussian(0, vStream_, size, dptr, mu, sigma); + utils::Check(status == VSL_STATUS_OK, + "Failed to generate random number by MKL."); + } +#else + /*! \brief random number seed used by PRNG*/ + unsigned rseed_; + // functions + inline void GenUniform(float *dptr, index_t size, float a, float b) { + for (index_t j = 0; j < size; ++j) { + dptr[j] = static_cast(RandNext()) * (b - a) + a; + } + } + inline void GenUniform(double *dptr, index_t size, double a, double b) { + for (index_t j = 0; j < size; ++j) { + dptr[j] = static_cast(RandNext()) * (b - a) + a; + } + } + inline void GenGaussian(float *dptr, index_t size, float mu, float sigma) { + this->GenGaussianX(dptr, size, mu, sigma); + } + inline void GenGaussian(double *dptr, index_t size, double mu, double sigma) { + this->GenGaussianX(dptr, size, mu, sigma); + } + inline void GenGaussianX(DType *dptr, index_t size, DType mu, DType sigma) { + DType g1 = 0.0f, g2 = 0.0f; + for (index_t j = 0; j < size; ++j) { + if ((j & 1) == 0) { + this->SampleNormal2D(&g1, &g2); + dptr[j] = mu + g1 * sigma; + } else { + dptr[j] = mu + g2 * sigma; + } + } + } + /*! \brief get next random number from rand */ + inline DType RandNext(void) { + return static_cast(rand_r(&rseed_)) / + (static_cast(RAND_MAX) + 1.0f); + } + /*! \brief return a real numer uniform in (0,1) */ + inline DType RandNext2(void) { + return (static_cast(rand_r(&rseed_)) + 1.0f) / + (static_cast(RAND_MAX) + 2.0f); + } + /*! + * \brief sample iid xx,yy ~N(0,1) + * \param xx first gaussian output + * \param yy second gaussian output + */ + inline void SampleNormal2D(DType *xx_, DType *yy_) { + DType &xx = *xx_, &yy = *yy_; + DType x, y, s; + do { + x = 2.0f * RandNext2() - 1.0f; + y = 2.0f * RandNext2() - 1.0f; + s = x * x + y * y; + } while (s >= 1.0f || s == 0.0f); + DType t = std::sqrt(-2.0f * std::log(s) / s); + xx = x * t; yy = y * t; + } +#endif + /*! \brief temporal space used to store random numbers */ + TensorContainer buffer_; +}; // class Random +// only allow GPU PRNG in CUDACC +#ifdef __CUDACC__ +/*! \brief GPU random number generator */ +template +class Random { + public: + /*! + * \brief constructor of random engine + * \param seed random number seed + */ + Random(int seed) { + curandStatus_t status; + status = curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT); + utils::Check(status == CURAND_STATUS_SUCCESS, + "Can not create CURAND Generator"); + this->Seed(seed); + buffer_.Resize(Shape1(kRandBufferSize)); + } + + ~Random(void) { + curandStatus_t status; + status = curandDestroyGenerator(gen_); + utils::Check(status == CURAND_STATUS_SUCCESS, + "Destory CURAND Gen failed"); + } + /*! 
+ * \brief set the stream of computation + * \param stream computation stream + */ + inline void set_stream(Stream *stream) { + curandStatus_t status; + status = curandSetStream(gen_, Stream::GetStream(stream)); + utils::Check(status == CURAND_STATUS_SUCCESS, + "set_stream CURAND failed"); + } + /*! + * \brief seed random number generator using this seed + * \param seed seed of prng + */ + inline void Seed(int seed) { + curandStatus_t status; + status = curandSetPseudoRandomGeneratorSeed(gen_, seed); + utils::Check(status == CURAND_STATUS_SUCCESS, + "Set CURAND seed failed."); + } + /*! + * \brief generate data from uniform [a,b) + * \param dst destination + * \param a lower bound of uniform + * \param b upper bound of uniform + * \tparam dim dimension of tensor + */ + template + inline void SampleUniform(Tensor *dst, + DType a = 0.0f, DType b = 1.0f) { + if (a == 0.0f && b == 1.0f) { + *dst = this->uniform(dst->shape_); + } else { + *dst = this->uniform(dst->shape_) * (b - a) + a; + } + } + /*! + * \brief generate data from standard gaussian + * \param dst destination + * \param mu mean variable + * \param sigma standard deviation + * \tparam dim dimension of tensor + */ + template + inline void SampleGaussian(Tensor *dst, + DType mu = 0.0f, DType sigma = 1.0f) { + *dst = this->gaussian(dst->shape_, mu, sigma); + } + /*! + * \brief return a temporal expression storing standard gaussian random variables + * the temporal tensor is only valid before next call of gaussian or uniform + * can be used as part of expression + * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, + * since second call of gaussian(s2) makes gaussian(s1) invalid + * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression + * \param shape shape of the tensor + * \param mu mean + * \param sigma variance + * \return a temporal expression storing standard gaussian random variables + * \tparam dim dimension of tensor + */ + template + inline expr::ReshapeExp, DType, dim, 1> + gaussian(Shape shape, DType mu = 0.0f, DType sigma = 1.0f) { + size_t aligned_sz = ((shape.Size() + 1UL) >> 1) << 1; + // allocate alligned size + buffer_.Resize(Shape1(aligned_sz)); + buffer_.Resize(Shape1(shape.Size())); + this->GenGaussian(buffer_.dptr_, aligned_sz, mu, sigma); + return expr::reshape(buffer_, shape); + } + /*! 
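
A sketch of GPU sampling with an explicit stream; this requires a CUDA build compiled with nvcc, and the shapes and seed are illustrative (`using namespace mshadow;` assumed).

```
InitTensorEngine<gpu>(0);
Stream<gpu> *stream = NewStream<gpu>();
Random<gpu, float> rnd(42);
rnd.set_stream(stream);                 // curand calls are queued on this stream
Tensor<gpu, 2, float> W(Shape2(256, 128));
W.set_stream(stream);
AllocSpace(&W);
rnd.SampleGaussian(&W, 0.0f, 0.01f);    // mean 0, standard deviation 0.01, on device
Tensor<cpu, 2, float> host(Shape2(256, 128));
AllocSpace(&host);
Copy(host, W, stream);                  // device-to-host copy on the same stream
stream->Wait();                         // block until sampling and copy finish
FreeSpace(&W);
FreeSpace(&host);
DeleteStream(stream);
ShutdownTensorEngine<gpu>();
```
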
+ * \brief return a temporal expression storing standard uniform [0,1) + * the temporal tensor is only valid before next call of gaussian or uniform + * can be used as part of expression + * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, + * since second call of gaussian(s2) makes gaussian(s1) invalid + * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression + * \param shape shape of the tensor + * \return a temporal expression storing standard uniform [0,1) + * \tparam dim dimension of tensor + */ + template + inline expr::ReshapeExp, DType, dim, 1> + uniform(Shape shape) { + buffer_.Resize(Shape1(shape.Size())); + this->GenUniform(buffer_.dptr_, buffer_.size(0)); + return expr::reshape(buffer_, shape); + } + + private: + inline void GenGaussian(float *dptr, size_t size, float mu, float sigma) { + curandStatus_t status; + status = curandGenerateNormal(gen_, dptr, size, mu, sigma); + utils::Check(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed"); + } + inline void GenGaussian(double *dptr, size_t size, double mu, double sigma) { + curandStatus_t status; + status = curandGenerateNormalDouble(gen_, dptr, size, mu, sigma); + utils::Check(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed"); + } + inline void GenUniform(float *dptr, size_t size) { + curandStatus_t status; + status = curandGenerateUniform(gen_, dptr, size); + utils::Check(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed"); + } + inline void GenUniform(double *dptr, size_t size) { + curandStatus_t status; + status = curandGenerateUniformDouble(gen_, dptr, size); + utils::Check(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed"); + } + /*! \brief random numbeer generator */ + curandGenerator_t gen_; + /*! \brief templ buffer */ + TensorContainer buffer_; +}; // class Random +#endif +} // namespace mshadow +#endif // MSHADOW_RANDOM_H_ diff --git a/mshadow/sse-inl.h b/mshadow/sse-inl.h new file mode 100644 index 000000000000..9281c2a7d487 --- /dev/null +++ b/mshadow/sse-inl.h @@ -0,0 +1,435 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file sse-inl.h + * \brief support of sse2 optimization of some operations + * \author Tianqi Chen + */ +#ifndef MSHADOW_SSE_INL_H_ +#define MSHADOW_SSE_INL_H_ +#ifdef __APPLE__ +#include +#else +#include +#endif +#include "./expression.h" +#include "./tensor.h" + +namespace mshadow { +/*! \brief namespace to support sse2 vectorization */ +namespace sse2 { +/*! + * \brief analog to cudaMallocPitch, allocate a aligned space with num_line * lspace cells + * \param out_pitch output parameter, the actuall space allocated for each line + * \param lspace number of cells required for each line + * \param num_line number of lines to be allocated + */ +inline void* AlignedMallocPitch(size_t *out_pitch, + size_t lspace, size_t num_line) { + size_t pitch = ((lspace+15) >> 4) << 4; + *out_pitch = pitch; +#ifdef _MSC_VER + void *res = _aligned_malloc(pitch * num_line, 16); +#else +#ifdef __APPLE__ + void *res = malloc(pitch * num_line); +#else + void *res = memalign(16, pitch * num_line); +#endif +#endif + utils::Check(res != NULL, "AlignedMallocPitch failed"); + return res; +} +/*! + * \brief free aligned space + * \param ptr pointer to space to be freed + */ +inline void AlignedFree(void *ptr) { +#ifdef _MSC_VER + _aligned_free(ptr); +#else + free(ptr); +#endif +} +/*! \brief check if a pointer is aligned */ +inline bool CheckAlign(size_t pitch) { + return !(pitch & ((1 << 4) - 1)); +} +/*! 
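
A sketch of how the aligned allocator above behaves; the sizes are illustrative.

```
size_t pitch;
const size_t num_line = 100;
const size_t line_bytes = 30 * sizeof(float);   // 120 bytes requested per line
float *data = static_cast<float*>(
    mshadow::sse2::AlignedMallocPitch(&pitch, line_bytes, num_line));
// pitch is rounded up to a multiple of 16 bytes (here 120 -> 128),
// so line i starts at reinterpret_cast<char*>(data) + i * pitch
mshadow::sse2::AlignedFree(data);
```
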
\brief check if a pointer is aligned */ +inline bool CheckAlign(void *ptr) { + return CheckAlign(reinterpret_cast(ptr)); +} +/*! + * \brief get upper bound of aligned index of size + * \param size size of the array + * \param fsize size of float + */ +inline index_t UpperAlign(index_t size, size_t fsize) { + return (((size * fsize + 15) >> 4) << 4) / fsize; +} +/*! + * \brief get lower bound of aligned index of size + * \param size size of the array + * \param fsize size of float + */ +inline index_t LowerAlign(index_t size, size_t fsize) { + return (((size * fsize) >> 4) << 4) / fsize; +} +} // namespace sse2 +} // namespace mshadow +#if MSHADOW_USE_SSE +// sse types are not compatible with nvcc, only use them in cpu mode +#include + +namespace mshadow { +namespace sse2 { +/*! + * \brief float vector real type, used for vectorization + * \tparam FloatType double or float + */ +template +struct FVec { + // whether the vectorization is enabled + static const bool kEnabled = false; +}; +/*! \brief vector real type for float */ +template<> +struct FVec { + // type + typedef __m128 DType; + // whether the vectorization is enabled + static const bool kEnabled = true; + /*! \brief number of float in vector */ + static const index_t kSize = 4; + /*! \brief data content */ + DType data_; + // functions + /* constructors */ + FVec(void) {} + explicit FVec(DType data) : data_(data) {} + /* set the float */ + explicit FVec(const float &s) { + data_ = _mm_set1_ps(s); + } + /*!\brief load from pointer src */ + explicit FVec(const float *src) { + data_ = _mm_load_ps(src); + } + /*! \brief store data into dst space */ + inline void Store(float *dst) const { + return _mm_store_ps(dst, data_); + } + /*! \brief sum of all content */ + inline float Sum(void) const { + DType ans = _mm_add_ps(data_, _mm_movehl_ps(data_, data_)); + DType rst = _mm_add_ss(ans, _mm_shuffle_ps(ans, ans, 1)); +#if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64) + return rst.m128_f32[0]; +#else + float rr = _mm_cvtss_f32(rst); + return rr; +#endif + } +}; +/*! \brief vector real type for float */ +template<> +struct FVec { + // data type + typedef __m128d DType; + // whether the vectorization is enabled + static const bool kEnabled = true; + /*! \brief number of float in vector */ + static const index_t kSize = 2; + /*! \brief data content */ + DType data_; + /* constructors */ + FVec(void) {} + explicit FVec(DType data) : data_(data) {} + /* set the float */ + explicit FVec(const double &s) { + data_ = _mm_set1_pd(s); + } + /*!\brief load from pointer src */ + explicit FVec(const double *src) { + data_ = _mm_load_pd(src); + } + /*! \brief store data into dst space */ + inline void Store(double *dst) const { + return _mm_store_pd(dst, data_); + } + /*! \brief sum of all content */ + inline double Sum(void) const { + DType tmp = _mm_add_sd(data_, _mm_unpackhi_pd(data_, data_)); +#if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64) + return tmp.m128d_f64[0]; +#else + double ans = _mm_cvtsd_f64(tmp); + return ans; +#endif + } +}; +/*! 
\brief sse2 operator type of certain operator */ +template +struct SSEOp{ + static const bool kEnabled = false; +}; +template<> +struct SSEOp { + static const bool kEnabled = true; + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_add_ps(lhs.data_, rhs.data_)); + } + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_add_pd(lhs.data_, rhs.data_)); + } +}; +template<> +struct SSEOp { + static const bool kEnabled = true; + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_sub_ps(lhs.data_, rhs.data_)); + } + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_sub_pd(lhs.data_, rhs.data_)); + } +}; +template<> +struct SSEOp { + static const bool kEnabled = true; + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_mul_ps(lhs.data_, rhs.data_)); + } + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_mul_pd(lhs.data_, rhs.data_)); + } +}; +template<> +struct SSEOp { + static const bool kEnabled = true; + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_div_ps(lhs.data_, rhs.data_)); + } + MSHADOW_CINLINE static FVec + Map(const FVec &lhs, const FVec &rhs) { + return FVec(_mm_div_pd(lhs.data_, rhs.data_)); + } +}; +template<> +struct SSEOp { + static const bool kEnabled = true; + MSHADOW_CINLINE static FVec Map(const FVec &src) { + return src; + } + MSHADOW_CINLINE static FVec Map(const FVec &src) { + return src; + } +}; +// savers to do storage +template +struct Saver{ + MSHADOW_CINLINE static void Save(TFloat *dst, const FVec &src) { + FVec lhs(dst); + FVec ans = SSEOp::Map(lhs, src); + ans.Store(dst); + } +}; +template +struct Saver { + MSHADOW_CINLINE static void Save(TFloat *dst, const FVec &src) { + src.Store(dst); + } +}; +} // namespace sse2 +namespace expr { +// same as plan, but use sse2 +template +class SSEPlan { + public: + /*! 
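
A sketch of the vectorized element-wise path these types implement: four floats per FVec<float>, combined through SSEOp. It assumes an SSE-enabled build (MSHADOW_USE_SSE) and the element-wise operator tag op::plus from the library's op namespace; the buffer contents are illustrative.

```
#include <cstdio>
size_t pitch;
float *buf = static_cast<float*>(
    mshadow::sse2::AlignedMallocPitch(&pitch, 8 * sizeof(float), 1));
for (int i = 0; i < 8; ++i) buf[i] = static_cast<float>(i);
mshadow::sse2::FVec<float> x(buf);        // aligned load of buf[0..3]
mshadow::sse2::FVec<float> y(buf + 4);    // aligned load of buf[4..7]
mshadow::sse2::FVec<float> z =
    mshadow::sse2::SSEOp<mshadow::op::plus>::Map(x, y);
z.Store(buf);                             // buf[0..3] is now {4, 6, 8, 10}
std::printf("%f\n", z.Sum());             // horizontal sum: 28
mshadow::sse2::AlignedFree(buf);
```
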
+ * \brief evaluate the expression at index [y][x], x will be aligned to 4 + * to be implemented by SubType + */ + MSHADOW_CINLINE sse2::FVec EvalSSE(index_t y, index_t x) const; + MSHADOW_CINLINE DType Eval(index_t y, index_t x) const; +}; +template +class SSEPlan, DType> { + public: + explicit SSEPlan(const Tensor &t) + :dptr_(t.dptr_), stride_(t.stride_) {} + MSHADOW_CINLINE sse2::FVec EvalSSE(index_t y, index_t x) const { + return sse2::FVec(&dptr_[y * stride_ + x]); + } + MSHADOW_CINLINE DType Eval(index_t y, index_t x) const { + return dptr_[y * stride_ + x]; + } + + private: + const DType *dptr_; + index_t stride_; +}; +template +class SSEPlan, DType> { + public: + explicit SSEPlan(DType scalar) : scalar_(scalar) {} + MSHADOW_CINLINE sse2::FVec EvalSSE(index_t y, index_t x) const { + return sse2::FVec(scalar_); + } + MSHADOW_CINLINE DType Eval(index_t y, index_t x) const { + return scalar_; + } + + private: + DType scalar_; +}; +template +class SSEPlan, DType> { + public: + SSEPlan(const SSEPlan &lhs, const SSEPlan &rhs) + : lhs_(lhs), rhs_(rhs) {} + MSHADOW_CINLINE sse2::FVec EvalSSE(index_t y, index_t x) const { + return sse2::SSEOp::Map(lhs_.EvalSSE(y, x), rhs_.EvalSSE(y, x)); + } + MSHADOW_CINLINE DType Eval(index_t y, index_t x) const { + return OP::Map(lhs_.Eval(y, x), rhs_.Eval(y, x)); + } + + private: + SSEPlan lhs_; + SSEPlan rhs_; +}; + +template +class SSEPlan, DType> { + public: + SSEPlan(const SSEPlan &src) : src_(src) {} + MSHADOW_CINLINE sse2::FVec EvalSSE(index_t y, index_t x) const { + return sse2::SSEOp::Map(src_.EvalSSE(y, x)); + } + MSHADOW_CINLINE DType Eval(index_t y, index_t x) const { + return OP::Map(src_.Eval(y, x)); + } + + private: + SSEPlan src_; +}; + +template +inline SSEPlan, DType> +MakeSSEPlan(const BinaryMapExp &e); + +template +inline SSEPlan, DType> MakeSSEPlan(const ScalarExp &e) { + return SSEPlan, DType>(e.scalar_); +} +template +inline SSEPlan MakeSSEPlan(const RValueExp &e) { + return SSEPlan(e.self()); +} +template +inline SSEPlan +MakeSSEPlan(const MakeTensorExp &e) { + return SSEPlan(e.real_self()); +} +template +inline SSEPlan, DType> +MakeSSEPlan(const UnaryMapExp &e) { + return SSEPlan, DType>(MakeSSEPlan(e.src_)); +} +template +inline SSEPlan, DType> +MakeSSEPlan(const BinaryMapExp &e) { + return SSEPlan, + DType>(MakeSSEPlan(e.lhs_), MakeSSEPlan(e.rhs_)); +} +/*! 
+ * \brief static check sse enable + * if a expression E can not be evaluated using sse, then kPass = false + * \tparam Device the type of Device + * \tparam dim dimension of the tensor + * \tparam E expression + */ +template +struct SSECheck{ + static const bool kPass = false; +}; +template +struct SSECheck > { + static const bool kPass = sse2::FVec::kEnabled; +}; +template +struct SSECheck > { + static const bool kPass = sse2::FVec::kEnabled; +}; +template +struct SSECheck > { + static const bool kPass = SSECheck::kPass && sse2::SSEOp::kEnabled; +}; +template +struct SSECheck< BinaryMapExp > { + static const bool kPass = SSECheck::kPass && + SSECheck::kPass && sse2::SSEOp::kEnabled; +}; +//------------------------------------------------- +// Check if data is aligned and allow sse operation +//------------------------------------------------- +template +struct SSEAlignCheck { + inline static bool Check(const E &exp) { + return false; + } +}; +template +struct SSEAlignCheck > { + inline static bool Check(const ScalarExp &exp) { + return true; + } +}; +template +struct SSEAlignCheck > { + inline static bool Check(const Tensor &t) { + return sse2::CheckAlign(t.dptr_) && + sse2::CheckAlign(t.stride_ * sizeof(DType)); + } +}; +template +struct SSEAlignCheck > { + inline static bool Check(const UnaryMapExp &t) { + return SSEAlignCheck::Check(t.src_); + } +}; +template +struct SSEAlignCheck > { + inline static bool Check(const BinaryMapExp &t) { + return SSEAlignCheck::Check(t.lhs_) && + SSEAlignCheck::Check(t.rhs_); + } +}; +/*! + * \brief use SSEPlan to compute result + */ +template +inline void MapSSEPlan(Tensor _dst, + const expr::SSEPlan &plan) { + Tensor dst = _dst.FlatTo2D(); + const index_t xlen = sse2::LowerAlign(dst.size(1), sizeof(DType)); + for (index_t y = 0; y < dst.size(0); ++y) { + for (index_t x = 0; x < xlen; x += sse2::FVec::kSize) { + sse2::Saver::Save(&dst[y][x], plan.EvalSSE(y, x)); + } + for (index_t x = xlen; x < dst.size(1); ++x) { + SV::Save(dst[y][x], plan.Eval(y, x)); + } + } +} +} // namespace expr +} // namespace mshadow +#endif // MSHADOW_USE_SSE +#endif // MSHADOW_SSE_INL_H_ diff --git a/mshadow/stream_gpu-inl.h b/mshadow/stream_gpu-inl.h new file mode 100644 index 000000000000..1cd2e971fc52 --- /dev/null +++ b/mshadow/stream_gpu-inl.h @@ -0,0 +1,70 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file stream_gpu-inl.h + * \brief implementation of GPU code + * \author Bing Xu, Tianqi Chen + */ +#ifndef MSHADOW_STREAM_GPU_INL_H_ +#define MSHADOW_STREAM_GPU_INL_H_ +#include "./base.h" +#include "./tensor.h" +#include "./utils.h" + +namespace mshadow { +#if MSHADOW_USE_CUDA==1 +// Stream alocation +// actual implementation of GPU stream in CUDA +template<> +struct Stream { + /*! \brief cudaStream */ + cudaStream_t stream_; + Stream(void) : stream_(0) {} + /*! + * \brief wait for all the computation associated + * with this stream to complete + */ + inline void Wait(void) { + cudaError_t err = cudaStreamSynchronize(stream_); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + } + /*! + * \brief query whether the the stream is idle + * \return true if the stream is idle and all the job have been completed + */ + inline bool CheckIdle(void) { + cudaError_t err = cudaStreamQuery(stream_); + if (err == cudaSuccess) return true; + if (err == cudaErrorNotReady) return false; + utils::Error(cudaGetErrorString(err)); + return false; + } + /*! 
+ * \brief returns actual cudaStream_t given an input GPU stream pointer + * \param stream pointer to GPU stream + */ + inline static cudaStream_t GetStream(Stream *stream) { + if (stream == NULL) { +#if MSHADOW_FORCE_STREAM + utils::Error("Default GPU stream was used when MSHADOW_FORCE_STREAM was on"); +#endif + return 0; + } + else return stream->stream_; + } +}; +template<> +inline Stream *NewStream(void) { + Stream *st = new Stream(); + cudaError_t err = cudaStreamCreate(&st->stream_); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + return st; +} +template<> +inline void DeleteStream(Stream *stream) { + cudaError_t err = cudaStreamDestroy(stream->stream_); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + delete stream; +} +#endif +} +#endif // MSHADOW_STREAM_GPU_INL_H_ diff --git a/mshadow/tensor.h b/mshadow/tensor.h index d3979b7751a8..773094dd4637 100644 --- a/mshadow/tensor.h +++ b/mshadow/tensor.h @@ -1,485 +1,649 @@ -#ifndef MSHADOW_TENSOR_H -#define MSHADOW_TENSOR_H /*! + * Copyright (c) 2014 by Contributors * \file tensor.h * \brief header file of tensor data structure and functions - * covention: this lib requires explicit memory allocation and de-allocation - * all the data structure Tensor, Tensor are like handles(pointers), - * no memory allocation is happening during calculation + * This lib requires explicit memory allocation and de-allocation + * all the data structure Tensor, Tensor are like handles(pointers), + * no memory allocation is happening during calculation + * + * For STL style tensor, see tensor_container.h * \author Bing Xu, Tianqi Chen */ -#include "tensor_base.h" -#include "tensor_expr.h" +#ifndef MSHADOW_TENSOR_H_ +#define MSHADOW_TENSOR_H_ +#include "./base.h" +#include "./expression.h" namespace mshadow { - /*! - * \brief shape of a tensor - * IMPORTANT NOTE: this shape is different from numpy.shape - * shape[0] gives the lowest dimension, shape[dimension-1] gives the highest dimension - * shape[k] corresponds to k-th dimension of tensor - * \tparam dimension dimension of tensor - */ - template - struct Shape { - public: - /*! \brief maximum dimension of tensor */ - const static int kMaxShape = dimension; - /*! \brief maximum dimension minus 1 */ - const static int kSubShape = dimension - 1; - public: - /*! \brief default constructor, do nothing */ - MSHADOW_XINLINE Shape(void) {} - /*! \brief constuctor */ - MSHADOW_XINLINE Shape( const Shape &s ){ - #pragma unroll - for( int i = 0; i < kMaxShape; ++i ){ - this->shape_[i] = s[i]; - } - this->stride_ = s.stride_; - } - /*! - * \brief get corresponding index - * \param idx dimension index - * \return the corresponding dimension size - */ - MSHADOW_XINLINE index_t& operator[](index_t idx) { - return shape_[ idx ]; - } - /*! - * \brief get corresponding index - * \param idx dimension index - * \return the corresponding dimension size - */ - MSHADOW_XINLINE const index_t& operator[](index_t idx) const { - return shape_[ idx ]; - } - /*! \return whether two shape equals */ - MSHADOW_XINLINE bool operator==(const Shape &s) const { - #pragma unroll - for ( int i = 0; i < kMaxShape; ++i ) { - if (s.shape_[i] != this->shape_[i]) return false; - } - return true; - } - /*! 
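
A sketch of explicit stream management with the specialization above; note that with MSHADOW_FORCE_STREAM defined, touching a gpu tensor that has no stream attached raises the error shown in GetStream. A CUDA build is assumed, after InitTensorEngine<gpu>() and with `using namespace mshadow;`.

```
Stream<gpu> *s = NewStream<gpu>();
Tensor<gpu, 2, float> A(Shape2(128, 64));
A.set_stream(s);          // kernels generated for A are queued on s
AllocSpace(&A);
A = 1.0f;                 // asynchronous with respect to the host
if (!s->CheckIdle()) {
  s->Wait();              // block until the queued work completes
}
FreeSpace(&A);
DeleteStream(s);
```
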
- * flatten the higher dimension to second dimension, return a 2D shape - * \return the flat 2d shape - */ - MSHADOW_XINLINE Shape<2> FlatTo2D(void) const { - Shape<2> s; - s.stride_ = this->stride_; - s.shape_[ 0 ] = this->shape_[ 0 ]; - index_t ymax = 1; - - #pragma unroll - for (int i = 1; i < kMaxShape; ++i) { - ymax *= this->shape_[ i ]; - } - s.shape_[1] = ymax; - return s; - } - /*! \return number of valid elements */ - MSHADOW_XINLINE size_t Size(void) const{ - size_t memsz = this->shape_[ 0 ]; - #pragma unroll - for (int i = 1; i < kMaxShape; ++i) { - memsz *= this->shape_[ i ]; - } - return memsz; - } - /*! \return memory size, including the aligned x dimension */ - MSHADOW_XINLINE size_t MSize(void) const { - size_t memsz = this->stride_; - #pragma unroll - for (int i = 1; i < kMaxShape; ++i) { - memsz *= this->shape_[ i ]; - } - return memsz; - } - /*! - * \return product shape in [dimstart,dimend) - * \param dimstart start dimension - * \param dimend end dimension - */ - MSHADOW_XINLINE index_t ProdShape( int dimstart, int dimend ) const{ - index_t num = 1; - #pragma unroll - for (int i = dimstart; i < dimend; ++i) { - num *= this->shape_[ i ]; - } - return num; - } - /*! - * \brief get subshape - * \return subshape - */ - MSHADOW_XINLINE Shape SubShape(void) const { - Shape s; - s.stride_ = this->stride_; - // for cuda - #pragma unroll - for (int i = 0; i < kSubShape; ++i) { - s.shape_[ i ] = this->shape_[ i ]; - } - return s; - } - - public: - /*! \brief storing the dimension information */ - index_t shape_[ kMaxShape ]; - /*! - * \brief storing the stride information in x dimension - * this is used to deal with pitch allocation in gpu or sse(align x dimension to 64bit) for efficiency - */ - index_t stride_; - }; - // useful construction functions to generate shape - /*! - * \brief construct a one dimension shape, stride will equal s0 - * \param s0 size of dimension 0 - * \return the shape construction - */ - MSHADOW_XINLINE Shape<1> Shape1( index_t s0 ){ - Shape<1> s; s[0] = s0; s.stride_ = s0; - return s; +/*! \brief device name CPU */ +struct cpu { + /*! \brief whether this device is CPU or not */ + static const bool kDevCPU = true; + /*! \brief device flag number, identifies this device */ + static const int kDevMask = 1 << 0; +}; +/*! \brief device name CPU */ +struct gpu { + /*! \brief whether this device is CPU or not */ + static const bool kDevCPU = false; + /*! \brief device flag number, identifies this device */ + static const int kDevMask = 1 << 1; +}; +/*! + * \brief shape of a tensor + * IMPORTANT NOTE: this shape is different from numpy.shape + * shape[0] gives the lowest dimension, shape[dimension-1] gives the highest dimension + * shape[k] corresponds to k-th dimension of tensor + * \tparam dimension dimension of tensor + */ +template +struct Shape { + /*! \brief dimension of current shape */ + static const int kDimension = dimension; + /*! \brief dimension of current shape minus one */ + static const int kSubdim = dimension - 1; + /*! \brief storing the dimension information */ + index_t shape_[kDimension]; + /*! \brief default constructor, do nothing */ + MSHADOW_XINLINE Shape(void) {} + /*! \brief constuctor */ + MSHADOW_XINLINE Shape(const Shape &s) { + #pragma unroll + for (int i = 0; i < kDimension; ++i) { + this->shape_[i] = s[i]; } - /*! 
- * \brief construct a two dimension shape, stride will equal s0 - * \param s1 size of dimension 1 - * \param s0 size of dimension 0 - * \return the shape construction - */ - MSHADOW_XINLINE Shape<2> Shape2( index_t s1, index_t s0 ){ - Shape<2> s; s[0] = s0; s[1] = s1; s.stride_ = s0; - return s; + } + /*! + * \brief get corresponding index + * \param idx dimension index + * \return the corresponding dimension size + */ + MSHADOW_XINLINE index_t &operator[](index_t idx) { + return shape_[idx]; + } + /*! + * \brief get corresponding index + * \param idx dimension index + * \return the corresponding dimension size + */ + MSHADOW_XINLINE const index_t &operator[](index_t idx) const { + return shape_[idx]; + } + /*! + * \return whether two shape equals + * \param s the shape to compare against + */ + MSHADOW_XINLINE bool operator==(const Shape &s) const { + #pragma unroll + for (int i = 0; i < kDimension; ++i) { + if (s.shape_[i] != this->shape_[i]) return false; } - /*! - * \brief construct a three dimension shape, stride will equal s0 - * \param s2 size of dimension 2 - * \param s1 size of dimension 1 - * \param s0 size of dimension 0 - * \return the shape construction - */ - MSHADOW_XINLINE Shape<3> Shape3( index_t s2, index_t s1, index_t s0 ){ - Shape<3> s; - s[0] = s0; s[1] = s1; s[2] = s2; s.stride_ = s0; - return s; + return true; + } + /*! + * flatten the higher dimension to second dimension, return a 2D shape + * \return the flat 2d shape + */ + MSHADOW_XINLINE Shape<2> FlatTo2D(void) const { + Shape<2> s; + s.shape_[1] = this->shape_[kDimension - 1]; + index_t ymax = 1; + #pragma unroll + for (int i = 0; i < kDimension - 1; ++i) { + ymax *= this->shape_[i]; } - /*! - * \brief construct a four dimension shape, stride will equal s0 - * \param s3 size of dimension 3 - * \param s2 size of dimension 2 - * \param s1 size of dimension 1 - * \param s0 size of dimension 0 - * \return the shape construction - */ - MSHADOW_XINLINE Shape<4> Shape4( index_t s3, index_t s2, index_t s1, index_t s0 ){ - Shape<4> s; - s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; s.stride_ = s0; - return s; + s.shape_[0] = ymax; + return s; + } + /*! \return number of valid elements */ + MSHADOW_XINLINE size_t Size(void) const { + size_t size = this->shape_[0]; + #pragma unroll + for (int i = 1; i < kDimension; ++i) { + size *= this->shape_[i]; } -}; // namespace mshadow - -namespace mshadow { - /*! \brief device name CPU */ - struct cpu { - /*! \brief whether this device is CPU or not */ - const static bool kDevCPU = true; - /*! \brief device flag number, identifies this device */ - const static int kDevMask = 1<<0; - }; - /*! \brief device name CPU */ - struct gpu { - /*! \brief whether this device is CPU or not */ - const static bool kDevCPU = false; - /*! \brief device flag number, identifies this device */ - const static int kDevMask = 1<<1; - }; - - // more compact template - /*! - * \brief general tensor - * \tparam Device which device the tensor is on - * \tparam dimension dimension of the tensor - */ - template - struct Tensor: public expr::ContainerExp< Tensor >{ - public: - /*! \brief whether current type lies in cpu */ - const static bool kDevCPU = Device::kDevCPU; - /*! \brief dimension of subtype */ - const static int kSubdim = dimension - 1; - - public: - /*! \brief pointer to the data */ - real_t *dptr; - /*! \brief shape of the tensor */ - Shape shape; - public: - /*! \brief default constructor */ - MSHADOW_XINLINE Tensor(void) {} - /*! 
\brief constructor from shape */ - MSHADOW_XINLINE Tensor(const Shape &shape): shape(shape) {} - /*! \brief constructor from data pointer and shape */ - MSHADOW_XINLINE Tensor(real_t *dptr, const Shape &shape): dptr((real_t*)dptr), shape(shape) {} - /*! - * \brief return size of i-th dimension, start counting from highest dimension - * This meets the habit of normal usage of size of matrix. Note that mat.shape[0] gives lowest dimension, - * while mat.size(0) returns the highest dimension - * \param the dimension count from the highest dimensin - * \return the size - */ - MSHADOW_XINLINE index_t size(index_t i) const { - return shape[dimension - 1 - i]; - } - /*! - * \brief flatten the tensor to 2 dimension, collapse the higher dimensions together - * \return tensor after flatten - */ - MSHADOW_XINLINE Tensor FlatTo2D(void) const { - return Tensor(reinterpret_cast \ - (dptr), shape.FlatTo2D()); - } - /*! - * \brief get a element of dimension - 1 - * \param idx index - * \return the result tensor - */ - MSHADOW_XINLINE Tensor operator[](index_t idx) const { - Shape s = shape.SubShape(); - return Tensor(reinterpret_cast \ - (dptr) + s.MSize() * idx, s); - } - /*! - * \brief slice the tensor in highest dimension [begin,end) - * \param begin begin position of slice - * \param end end position of slice - * \return tensor after slice - */ - MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const { - Shape s = this->shape; - s[ dimension - 1 ] = end - begin; - return Tensor(reinterpret_cast\ - (dptr) + s.SubShape().MSize() * begin, s); - } - public: - /*!\brief functions to fit expression template */ - inline Tensor& operator=( real_t s ){ - return this->__assign( s ); - } - /*!\brief functions to fit expression template */ - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - /*!\brief functions to fit expression template */ - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - }; - - /* - * respecialized class Tensor1D,thei is due to different implementation in operator[] - */ - template - struct Tensor: public expr::ContainerExp< Tensor >{ - public: - real_t *dptr; - Shape<1> shape; - public: - MSHADOW_XINLINE Tensor(void) {} - MSHADOW_XINLINE Tensor(const Shape<1> &shape): shape(shape) {} - MSHADOW_XINLINE Tensor(real_t *dptr, Shape<1> shape) :dptr(dptr), shape(shape) {} - - MSHADOW_XINLINE Tensor FlatTo2D(void) const { - return Tensor(reinterpret_cast \ - (dptr), shape.FlatTo2D()); - } - MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const { - Shape<1> s; - s[0] = s.stride_ = end - begin; - return Tensor(reinterpret_cast \ - (dptr) + begin, s); - } - MSHADOW_XINLINE index_t size(index_t i) const { - return shape[0]; - } - MSHADOW_XINLINE real_t &operator[](index_t idx) { return dptr[ idx ]; } - MSHADOW_XINLINE const real_t &operator[](index_t idx)const { return dptr[ idx ]; } - public: - // functions to fit expression template - inline Tensor& operator=( double s ){ - return this->__assign( s ); - } - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - }; -}; // namespace mshadow - -// add unroll loops for the shape -namespace mshadow { - // function declarations - /*! 
- * \brief initialize tensor engine, used to call intialization functions of dependent libs - * this function should be called before all GPU tensor operations, - * for using tensors in CPU, this call is actually not needed - * \param device_id GPU device id to be choosed - */ - inline void InitTensorEngine( int device_id=0 ); - /*! - * \brief Shutdown tensor engine, - * this function should be called after all GPU tensor operations, - * for using tensors in CPU, this call is actually not needed - */ - inline void ShutdownTensorEngine( void ); - - /*! - * \brief CPU/CPU: allocate space for CTensor, according to the shape in the obj - * this function is responsible to set the stride_ in each obj.shape - * \tparam dim specify the dim of tensor - * \param obj the tensor object, with shape specified - * \param pad whether padding dimension 0, to make last dimension aligned, - * padding may help improve efficiency of matrix multiplications - * if true, will allocate space with stride_ that may not equals shape[0] - * if false, will allocate continuous space - */ - template - inline void AllocSpace(Tensor &obj, bool pad = MSHADOW_ALLOC_PAD); - /*! \brief refer to comment of cpu ver \sa AllocSpace */ - template - inline void AllocSpace(Tensor &obj, bool pad = MSHADOW_ALLOC_PAD); - - /*! - * \brief CPU/GPU: free the space of tensor, will set obj.dptr to NULL - * \tparam dim specify the dim of tensor - * \param obj the tensor object - */ - template - inline void FreeSpace(Tensor &obj); - /*! \brief refer to comment of cpu ver \sa FreeSpace */ - template - inline void FreeSpace(Tensor &obj); - - /*! - * \brief CPU/GPU: short cut to allocate and initialize a Tensor - * \tparam Device device of tensor - * \tparam dim dimention of tensor - * \param shape: shape of tensor - * \param initv: initialization value - * \param pad : padding option - * \sa AllocSpace - */ - template - inline Tensor NewTensor(const Shape &shape, real_t initv, bool pad = MSHADOW_ALLOC_PAD); - - /*! - * \brief copy data from one tensor to another, with same shape - * \tparam dim specify the dim of tensor - * \param dst target tensor - * \param src source tensor - */ - template - inline void Copy(Tensor dst, const Tensor &src ); - /*! \brief refer to comment of cpu ver \sa Copy */ - template - inline void Copy(Tensor dst, const Tensor &src ); - /*! \brief refer to comment of cpu ver \sa Copy */ - template - inline void Copy(Tensor dst, const Tensor &src ); - /*! \brief refer to comment of cpu ver \sa Copy */ - template - inline void Copy(Tensor dst, const Tensor &src ); - - - /*! - * \brief CPU/GPU: normalize softmax: dst[i][j] = exp( energy[i][j] ) /( sum_j exp( energy[i][j] ) ) - * \param dst destination - * \param energy input energy - */ - inline void Softmax( Tensor dst, const Tensor &energy ); - /*! \brief refer to comment of cpu ver \sa Softmax */ - inline void Softmax( Tensor dst, const Tensor &energy ); - -}; // namespace mshadow - - -namespace mshadow{ - // function declarations to support expression, no need to understand them - // these functions do not need to be directly used - - /*! 
- * \brief CPU/GPU: map a expression to a tensor, this function calls MapPlan - * \tparam Saver specify storage method - * \tparam dim dim of the tensor, during usage, there is no need to specify this parameter - * \tparam E specifies the expression type, not need to specify this parameter during usage - * \tparam etype expression type - * \param dst destination - * \param exp expression - * \sa namespace mshadow:sv, mshadow::op, mshadow::expr - */ - template - inline void MapExp(Tensor dst, const expr::Exp &exp ); - /*! \brief refer to comment of cpu ver \sa MapExp */ - template - inline void MapExp(Tensor dst, const expr::Exp &exp ); - - /*! - * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in lowest dimension (dimension 0) - * \tparam Saver specify storage method - * \tparam Reducer specify a reducer method - * \tparam E specifies the expression type, not need to specify this parameter during usage - * \tparam etype expression type - * \param dst destination - * \param exp expression - * \param scale scale the result before save - * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr - */ - template - inline void MapReduceKeepLowest( Tensor dst, const expr::Exp &exp, real_t scale = 1.0f ); - /*! \brief refer to comment of cpu ver \sa MapReduceKeepLowest */ - template - inline void MapReduceKeepLowest( Tensor dst, const expr::Exp &exp, real_t scale = 1.0f ); - - - /*! - * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in third dimension (dimension 2) - * \tparam Saver specify storage method - * \tparam Reducer specify a reducer method - * \tparam E specifies the expression type, not need to specify this parameter during usage - * \tparam dimkeep the target dimension to be kept, should be larger than 0, for 0, use MapReduceKeepLowest - * \tparam etype expression type - * \param dst destination - * \param exp expression - * \param scale scale the result before save - * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr - */ - template - inline void MapReduceKeepHighDim( Tensor dst, const expr::Exp &exp, real_t scale = 1.0f ); - /*! \brief refer to comment of cpu ver \sa MapReduceKeepHighDim */ - template - inline void MapReduceKeepHighDim( Tensor dst, const expr::Exp &exp, real_t scale = 1.0f ); - -};// namespace mshadow - -// execution implementation of expression evaluations -#include "tensor_expr_engine-inl.hpp" -// cpu implementation of functions -#include "tensor_cpu-inl.hpp" -// gpu implementation of functions -#include "tensor_gpu-inl.hpp" -// extension of expressions -#include "tensor_expr_ext.h" -// io -#include "tensor_io.h" -// container -#include "tensor_container.h" -// random number generator -#include "tensor_random.h" -#endif // TENSOR_H + return size; + } + /*! + * \return product shape in [dimstart,dimend) + * \param dimstart start dimension + * \param dimend end dimension + */ + MSHADOW_XINLINE index_t ProdShape(int dimstart, int dimend) const { + index_t num = 1; + #pragma unroll + for (int i = dimstart; i < dimend; ++i) { + num *= this->shape_[i]; + } + return num; + } + /*! + * \brief get subshape that takes off largest dimension +v * \return subshape + */ + MSHADOW_XINLINE Shape SubShape(void) const { + Shape s; + // for cuda + #pragma unroll + for (int i = 0; i < kSubdim; ++i) { + s.shape_[i] = this->shape_[i + 1]; + } + return s; + } + /*! 
+ * \brief slice the shape from start to end + * \tparam dimstart start dimension + * \tparam dimend end dimension + * \return the sliced shape + */ + template + MSHADOW_XINLINE Shape Slice(void) const { + Shape s; + #pragma unroll + for (int i = dimstart; i < dimend; ++i) { + s[i - dimstart] = this->shape_[i]; + } + return s; + } +}; // Shape +//------------------------------------------------ +// useful construction functions to generate shape +//------------------------------------------------- +/*! + * \brief construct a one dimension shape, stride will equal s0 + * \param s0 size of dimension 0 + * \return the shape construction + */ +MSHADOW_XINLINE Shape<1> Shape1(index_t s0) { + Shape<1> s; s[0] = s0; + return s; +} +/*! + * \brief construct a two dimension shape, stride will equal s0 + * \param s0 size of dimension 0 + * \param s1 size of dimension 1 + * \return the shape construction + */ +MSHADOW_XINLINE Shape<2> Shape2(index_t s0, index_t s1) { + Shape<2> s; s[0] = s0; s[1] = s1; + return s; +} +/*! + * \brief construct a three dimension shape, stride will equal s0 + * \param s0 size of dimension 0 + * \param s1 size of dimension 1 + * \param s2 size of dimension 2 + * \return the shape construction + */ +MSHADOW_XINLINE Shape<3> Shape3(index_t s0, index_t s1, index_t s2) { + Shape<3> s; + s[0] = s0; s[1] = s1; s[2] = s2; + return s; +} +/*! + * \brief construct a four dimension shape, stride will equal s0 + * \param s0 size of dimension 0 + * \param s1 size of dimension 1 + * \param s2 size of dimension 2 + * \param s3 size of dimension 3 + * \return the shape construction + */ +MSHADOW_XINLINE Shape<4> Shape4(index_t s0, index_t s1, + index_t s2, index_t s3) { + Shape<4> s; + s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; + return s; +} +/*! + * \brief computaion stream structure, used for asynchronize computation + */ +template +struct Stream { + // this is only a dummy implementation for CPU + // for GPU, the actual implementation will be specialized in tensor_gpu-inl.h + /*! + * \brief wait for all the computation associated + * with this stream to complete + */ + inline void Wait(void) {} + /*! + * \brief query whether the the stream is idle + * \return true if the stream is idle and all the job have been completed + */ + inline bool CheckIdle(void) { + return true; + } +}; +/*! + * \brief Tensor RValue, this is the super type of all kinds of possible tensors + * \tparam Container the tensor type + * \tparam Device which device the tensor is on + * \tparam dimension dimension of the tensor + * \tparam DType the type of elements in the tensor + */ +template +struct TRValue: public expr::RValueExp { +}; +// more compact template +/*! + * \brief general tensor + * \tparam Device which device the tensor is on + * \tparam dimension dimension of the tensor + * \tparam DType the type of elements in the tensor + */ +template +struct Tensor: public TRValue, + Device, dimension, DType> { + public: + //-------------------------------- + // struct memembers + //-------------------------------- + /*! \brief whether current type lies in cpu */ + static const bool kDevCPU = Device::kDevCPU; + /*! \brief dimension of subtype */ + static const int kSubdim = dimension - 1; + //-------------------------------- + // struct memembers + //-------------------------------- + /*! \brief pointer to the data */ + DType *dptr_; + /*! \brief shape of the tensor */ + Shape shape_; + /*! 
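
A few concrete shape manipulations matching the definitions above; the values are illustrative, `using namespace mshadow;` assumed.

```
Shape<2> s2 = Shape2(3, 5);       // s2[0] == 3, s2[1] == 5, s2.Size() == 15
Shape<3> s3 = Shape3(2, 3, 4);
Shape<2> flat = s3.FlatTo2D();    // collapses leading dims: flat[0] == 6, flat[1] == 4
Shape<2> sub = s3.SubShape();     // drops s3[0]: sub[0] == 3, sub[1] == 4
```
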
+ * \brief storing the stride information in x dimension + * this is used to deal with pitch allocation in gpu or sse(align x dimension to 64bit) for efficiency + */ + index_t stride_; + /*! + * \brief stream where the computation lies + * stream is a device dependency concept where each computation + */ + Stream *stream_; + //-------------------------------- + // functions + //-------------------------------- + /*! \brief default constructor */ + MSHADOW_XINLINE Tensor(void) : stream_(NULL) {} + /*! \brief constructor from shape */ + MSHADOW_XINLINE Tensor(const Shape &shape) + : shape_(shape), stream_(NULL) {} + /*! \brief constructor from data pointer and shape, without stride */ + MSHADOW_XINLINE Tensor(DType *dptr, const Shape &shape) + : dptr_(dptr), shape_(shape), stride_(shape[kSubdim]), stream_(NULL) {} + /*! \brief constructor from data pointer and shape */ + MSHADOW_XINLINE Tensor(DType *dptr, + const Shape &shape, + index_t stride, Stream *stream) + : dptr_(dptr), shape_(shape), stride_(stride), stream_(stream) {} + /*! + * \brief set the stream to do computation of current tensor + * \param stream the computation stream + */ + inline void set_stream(Stream *stream) { + this->stream_ = stream; + } + /*! + * \return memory cost of the tensor, including the aligned x dimension + * \tparam startdim the starting dimension + */ + template + MSHADOW_XINLINE size_t MemSize(void) const { + size_t memsz = this->stride_; + #pragma unroll + for (int i = startdim; i < kSubdim; ++i) { + memsz *= this->shape_[i]; + } + return memsz; + } + /*! + * \return whether the tensor's memory is continuous + * x dimension same as stride + */ + MSHADOW_XINLINE bool CheckContiguous(void) const { + return this->shape_[dimension - 1] == stride_; + } + /*! + * \return memory cost of the tensor, including the aligned x dimension + */ + MSHADOW_XINLINE size_t MSize(void) const { + return this->MemSize<0>(); + } + /*! + * \brief return size of i-th dimension, start counting from highest dimension + * \param idx the dimension count from the highest dimensin + * \return the size + */ + MSHADOW_XINLINE index_t size(index_t idx) const { + return shape_[idx]; + } + /*! + * \brief flatten the tensor to 2 dimension, collapse the higher dimensions together + * \return tensor after flatten + */ + MSHADOW_XINLINE Tensor FlatTo2D(void) const { + return Tensor(dptr_, shape_.FlatTo2D(), stride_, stream_); + } + /*! + * \brief get a element of dimension - 1 + * \param idx index + * \return the result tensor + */ + MSHADOW_XINLINE Tensor operator[](index_t idx) const { + return Tensor(dptr_ + this->MemSize<1>() * idx, + shape_.SubShape(), stride_, stream_); + } + /*! 
+ * \brief slice the tensor in highest dimension [begin,end) + * \param begin begin position of slice + * \param end end position of slice + * \return tensor after slice + */ + MSHADOW_XINLINE Tensor + Slice(index_t begin, index_t end) const { + Shape s = this->shape_; + s[0] = end - begin; + return Tensor(dptr_ + this->MemSize<1>() * begin, + s, stride_, stream_); + } + /*!\brief implement the assignment of same type */ + inline Tensor & + operator=(const Tensor &exp) { + dptr_ = exp.dptr_; + shape_ = exp.shape_; + stride_ = exp.stride_; + stream_ = exp.stream_; + return *this; + } + /*!\brief functions to fit expression template */ + template + inline Tensor & + operator=(const expr::Exp &exp) { + return this->__assign(exp); + } + /*!\brief functions to fit expression template */ + inline Tensor &operator=(const DType &exp) { + return this->__assign(exp); + } +}; +/* + * respecialized class Tensor1D, thei is due to different implementation in operator[] + */ +template +struct Tensor: + public TRValue, Device, 1, DType> { + public: + DType *dptr_; + Shape<1> shape_; + index_t stride_; + Stream *stream_; + // constructor + MSHADOW_XINLINE Tensor(void) : stream_(NULL) {} + MSHADOW_XINLINE Tensor(const Shape<1> &shape) + : shape_(shape), stream_(NULL) {} + MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape) + : dptr_(dptr), shape_(shape), stride_(shape[0]), stream_(NULL) {} + MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape, + index_t stride, Stream *stream) + : dptr_(dptr), shape_(shape), stride_(stride), stream_(stream) {} + inline void set_stream(Stream *stream) { + this->stream_ = stream; + } + MSHADOW_XINLINE Tensor FlatTo2D(void) const { + return Tensor(dptr_, shape_.FlatTo2D(), stride_, stream_); + } + MSHADOW_XINLINE Tensor Slice(index_t begin, index_t end) const { + Shape<1> s; + s[0] = end - begin; + return Tensor(dptr_ + begin, s, s[0], stream_); + } + MSHADOW_XINLINE bool CheckContiguous(void) const { + return true; + } + MSHADOW_XINLINE size_t MSize(void) const { + return shape_[0]; + } + MSHADOW_XINLINE index_t size(index_t i) const { + return shape_[0]; + } + MSHADOW_XINLINE DType &operator[](index_t idx) { + return dptr_[idx]; + } + MSHADOW_XINLINE const DType &operator[](index_t idx) const { + return dptr_[idx]; + } + /*!\brief implement the assignment of same type */ + inline Tensor & + operator=(const Tensor &exp) { + dptr_ = exp.dptr_; + shape_ = exp.shape_; + stride_ = exp.stride_; + stream_ = exp.stream_; + return *this; + } + template + inline Tensor & + operator=(const expr::Exp &exp) { + return this->__assign(exp); + } + inline Tensor &operator=(const DType &exp) { + return this->__assign(exp); + } +}; +//------------------------ +// Function Declarations +//----------------------- +/*! + * \brief initialize tensor engine, used to call intialization functions of dependent libs + * this function should be called before all GPU tensor operations, + * for using tensors in CPU, this call is actually not needed + * \param device_id GPU device id to be choosed + * \tparam Device the device type + */ +template +inline void InitTensorEngine(int device_id = 0); +/*! + * \brief Shutdown tensor engine on current device + * this function should be called after all GPU tensor operations, + * for using tensors in CPU, this call is actually not needed + * \tparam Device the device type + */ +template +inline void ShutdownTensorEngine(void); +/*! 
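
A sketch of indexing, slicing and flattening with the Tensor types above; shapes are illustrative, `using namespace mshadow;` assumed.

```
Tensor<cpu, 3, float> t = NewTensor<cpu, float>(Shape3(4, 3, 5), 0.0f);
Tensor<cpu, 2, float> page = t[2];           // drops the leading dimension: 3 x 5 view
page[1][4] = 2.5f;                           // element access via the 1D specialization
Tensor<cpu, 3, float> part = t.Slice(1, 3);  // views t[1] and t[2]: 2 x 3 x 5
Tensor<cpu, 2, float> flat = t.FlatTo2D();   // 12 x 5 view over the same memory
FreeSpace(&t);                               // frees the storage behind all the views
```
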
+ * \brief set the device of current thread to work on + * \param devid the device id + * \tparam Device the device type + */ +template +inline void SetDevice(int devid); +/*! + * \brief create a new stream from system + * \return a pointer to the created stream + * \tparam Device the device type + */ +template +inline Stream *NewStream(void); +/*! + * \brief delete the computing stream + * \param stream the stream parameter to be deleted + */ +template +inline void DeleteStream(Stream *stream); +/*! + * \brief CPU/CPU: allocate space for CTensor, according to the shape in the obj + * this function is responsible to set the stride_ in each obj.shape + * \param obj the tensor object, with shape specified + * \param pad whether padding dimension 0, to make last dimension aligned, + * padding may help improve efficiency of matrix multiplications + * if true, will allocate space with stride_ that may not equals shape[0] + * if false, will allocate continuous space + * \tparam dim specify the dim of tensor + * \tparam DType type of element in tensor + */ +template +inline void AllocSpace(Tensor *obj, + bool pad = MSHADOW_ALLOC_PAD); +/*! \brief refer to comment of cpu ver \sa AllocSpace */ +template +inline void AllocSpace(Tensor *obj, + bool pad = MSHADOW_ALLOC_PAD); +/*! + * \brief CPU/GPU: free the space of tensor, will set obj.dptr to NULL + * \param obj the tensor object + * \tparam dim specify the dim of tensor + * \tparam DType type of element in tensor + */ +template +inline void FreeSpace(Tensor *obj); +/*! \brief refer to comment of cpu ver \sa FreeSpace */ +template +inline void FreeSpace(Tensor *obj); +/*! + * \brief CPU/GPU: short cut to allocate and initialize a Tensor + * \param shape: shape of tensor + * \param initv: initialization value + * \param pad : padding option + * \tparam Device device of tensor + * \tparam DType type of element in tensor + * \tparam dim dimention of tensor + * \sa AllocSpace + */ +template +inline Tensor NewTensor(const Shape &shape, + DType initv, + bool pad = MSHADOW_ALLOC_PAD); +/*! + * \brief copy data from one tensor to another, with same shape + * \param dst target tensor + * \param src source tensor + * \param stream the stream, when specified, the copy can exhibit asynchronize behavior + * \tparam dim specify the dim of tensor + * \tparam DType type of element in tensor + */ +template +inline void Copy(Tensor dst, + const Tensor &src, + Stream *stream = NULL); +/*! \brief refer to comment of cpu ver \sa Copy */ +template +inline void Copy(Tensor dst, + const Tensor &src, + Stream *stream = NULL); +/*! \brief refer to comment of cpu ver \sa Copy */ +template +inline void Copy(Tensor dst, + const Tensor &src, + Stream *stream = NULL); +/*! \brief refer to comment of cpu ver \sa Copy */ +template +inline void Copy(Tensor dst, + const Tensor &src, + Stream *stream = NULL); +/*! + * \brief CPU/GPU: normalize softmax: dst[i][j] = exp(energy[i][j]) /(sum_j exp(energy[i][j])) + * \param dst destination + * \param energy input energy + */ +template +inline void Softmax(Tensor dst, const Tensor &energy); +/*! \brief refer to comment of cpu ver \sa Softmax */ +template +inline void Softmax(Tensor dst, const Tensor &energy); +// function declarations to support expression, no need to understand them +// these functions do not need to be directly used +/*! 
+ * \brief CPU/GPU: map a expression to a tensor, this function calls MapPlan + * \tparam Saver specify storage method + * \tparam R specifies the storage type of the tensor + * \tparam dim dim of the tensor, during usage, there is no need to specify this parameter + * \tparam DType the type of elements in the tensor + * \tparam E specifies the expression type, not need to specify this parameter during usage + * \tparam etype expression type + * \param dst destination + * \param exp expression + * \sa namespace mshadow:sv, mshadow::op, mshadow::expr + */ +template +inline void MapExp(TRValue *dst, + const expr::Exp &exp); +/*! \brief refer to comment of cpu ver \sa MapExp */ +template +inline void MapExp(TRValue *dst, + const expr::Exp &exp); +/*! + * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in lowest dimension (dimension 0) + * \tparam Saver specify storage method + * \tparam Reducer specify a reducer method + * \tparam R specifies the storage type of the tensor + * \tparam DType the type of elements in the tensor + * \tparam E specifies the expression type, not need to specify this parameter during usage + * \tparam etype expression type + * \param dst destination + * \param exp expression + * \param scale scale the result before save + * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr + */ +template +inline void MapReduceKeepLowest(TRValue *dst, + const expr::Exp &exp, + DType scale = 1); +/*! \brief refer to comment of cpu ver \sa MapReduceKeepLowest */ +template +inline void MapReduceKeepLowest(TRValue *dst, + const expr::Exp &exp, + DType scale = 1); +/*! + * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in third dimension (dimension 2) + * \tparam Saver specify storage method + * \tparam Reducer specify a reducer method + * \tparam R specifies the storage type of the tensor + * \tparam DType the type of elements in the tensor + * \tparam dimkeep the target dimension to be kept, should be larger than 0, for 0, use MapReduceKeepLowest + * \tparam E specifies the expression type, not need to specify this parameter during usage + * \tparam etype expression type + * \param dst destination + * \param exp expression + * \param scale scale the result before save + * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr + */ +template +inline void MapReduceKeepHighDim(TRValue *dst, + const expr::Exp &exp, + DType scale = 1); +/*! 
\brief refer to comment of cpu ver \sa MapReduceKeepHighDim */ +template +inline void MapReduceKeepHighDim(TRValue *dst, + const expr::Exp &exp, + DType scale = 1); +} // namespace mshadow +// include headers +#include "./stream_gpu-inl.h" +#include "./expr_engine-inl.h" +#include "./extension.h" +#include "./tensor_cpu-inl.h" +#include "./tensor_gpu-inl.h" +#include "./io.h" +#include "./tensor_container.h" +#include "./random.h" +// add definition of scalar related operators +#ifdef MSAHDOW_SCALAR_ + #error "MSHADOW_SCALAR_ must not be defined" +#endif +// enumerate all the scalar data type we aim to be good at +#define MSHADOW_SCALAR_ float +#include "./expr_scalar-inl.h" +#undef MSHADOW_SCALAR_ +#define MSHADOW_SCALAR_ double +#include "./expr_scalar-inl.h" +#undef MSHADOW_SCALAR_ +#define MSHADOW_SCALAR_ int +#include "./expr_scalar-inl.h" +#undef MSHADOW_SCALAR_ +#endif // MSHADOW_TENSOR_H_ diff --git a/mshadow/tensor_base.h b/mshadow/tensor_base.h deleted file mode 100644 index b251cbadf4fc..000000000000 --- a/mshadow/tensor_base.h +++ /dev/null @@ -1,298 +0,0 @@ -#ifndef MSHADOW_TENSOR_BASE_H -#define MSHADOW_TENSOR_BASE_H -/*! - * \file tensor_base.h - * \brief definitions of base types, macros functions - * - * \author Bing Xu, Tianqi Chen - */ -#include -#include -#include -#include -#include -// macro defintiions - -/*!\brief if this macro is define to be 1, mshadow should compile without any of other libs */ -#ifndef MSHADOW_STAND_ALONE - #define MSHADOW_STAND_ALONE 0 -#endif - -/*! \brief whether do padding during allocation */ -#ifndef MSHADOW_ALLOC_PAD - #define MSHADOW_ALLOC_PAD true -#endif - -/*! - * \brief x dimension of data must be bigger pad_size * ratio to be alloced padded memory, otherwise use tide allocation - * for example, if pad_ratio=2, GPU memory alignement size is 32, then we will only allocate padded memory if x dimension > 64 - * set it to 0 then we will always allocate padded memory - */ -#ifndef MSHADOW_MIN_PAD_RATIO - #define MSHADOW_MIN_PAD_RATIO 2 -#endif - -#if MSHADOW_STAND_ALONE - #define MSHADOW_USE_CBLAS 0 - #define MSHADOW_USE_MKL 0 - #define MSHADOW_USE_CUDA 0 -#endif - -/*! \brief use CBLAS for CBLAS */ -#ifndef MSHADOW_USE_CBLAS - #define MSHADOW_USE_CBLAS 0 -#endif -/*! \brief use MKL for BLAS */ -#ifndef MSHADOW_USE_MKL - #define MSHADOW_USE_MKL 1 -#endif -/*! \brief use CUDA support, must ensure that the cuda include path is correct, or directly compile using nvcc */ -#ifndef MSHADOW_USE_CUDA - #define MSHADOW_USE_CUDA 1 -#endif -/*! \brief use single precition float */ -#ifndef MSHADOW_SINGLE_PRECISION - #define MSHADOW_SINGLE_PRECISION 1 -#endif -/*! \brief whether use SSE */ -#ifndef MSHADOW_USE_SSE - #define MSHADOW_USE_SSE 1 -#endif -/*! \brief whether use NVML to get dynamic info */ -#ifndef MSHADOW_USE_NVML - #define MSHADOW_USE_NVML 0 -#endif -// SSE is conflict with cudacc -#ifdef __CUDACC__ - #undef MSHADOW_USE_SSE - #define MSHADOW_USE_SSE 0 -#endif - -#if MSHADOW_USE_CBLAS -extern "C"{ - #include -} -#elif MSHADOW_USE_MKL - #include - #include - #include - #include -#endif - -#if MSHADOW_USE_CUDA - #include - #include -#endif - -#if MSHADOW_USE_NVML - #include -#endif -// -------------------------------- -// MSHADOW_XINLINE is used for inlining template code for both CUDA and CPU code. 
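The declarations above in mshadow/tensor.h (InitTensorEngine, NewTensor, Copy, Softmax, MapExp and the MapReduce variants) are the public entry points of mshadow-2.x. A minimal CPU-only sketch of how they fit together follows; the template arguments are reconstructed here because the patch text lost its angle brackets, so treat the exact signatures as illustrative rather than authoritative.

```
// Minimal CPU-only usage sketch of the tensor.h API declared above.
// Template arguments are reconstructed, so check them against the header.
#include "mshadow/tensor.h"
using namespace mshadow;

int main(void) {
  InitTensorEngine<cpu>();
  // allocate two 2x3 float tensors, initialized to 0 and 1
  Tensor<cpu, 2, float> weight = NewTensor<cpu>(Shape2(2, 3), 0.0f);
  Tensor<cpu, 2, float> grad   = NewTensor<cpu>(Shape2(2, 3), 1.0f);
  const float eta = 0.1f, lambda = 0.01f;
  // element-wise expression: operator= dispatches to MapExp,
  // which evaluates the whole expression in one loop, no temporaries
  weight = -eta * (grad + lambda * weight);
  // copy into a freshly allocated tensor of the same shape
  Tensor<cpu, 2, float> snapshot = NewTensor<cpu>(Shape2(2, 3), 0.0f);
  Copy(snapshot, weight);
  FreeSpace(&weight);
  FreeSpace(&grad);
  FreeSpace(&snapshot);
  ShutdownTensorEngine<cpu>();
  return 0;
}
```

The same expression code is intended to compile for gpu tensors as well, with NewStream and set_stream supplying the computation stream declared above.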
-#ifdef MSHADOW_XINLINE - #error "MSHADOW_XINLINE must not be defined" -#endif -#ifdef __CUDACC__ - #define MSHADOW_XINLINE inline __attribute__((always_inline)) __device__ __host__ -#else - #define MSHADOW_XINLINE inline __attribute__((always_inline)) -#endif -/*! \brief cpu force inline */ -#define MSHADOW_CINLINE inline __attribute__((always_inline)) - -#if defined(__GXX_EXPERIMENTAL_CXX0X) || defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L - #define MSHADOW_CONSTEXPR constexpr -#else - #define MSHADOW_CONSTEXPR const -#endif - -/*! \brief namespace for mshadow */ -namespace mshadow { - /*! \brief buffer size for each random number generator */ - const unsigned kRandBufferSize = 1000000; - /*! \brief pi */ - const float kPi = 3.1415926f; - -#if MSHADOW_SINGLE_PRECISION - /*! \brief type that will be used for content */ - typedef float real_t; -#else - typedef double real_t; -#endif - /*! \brief type that will be used for index */ - typedef unsigned index_t; -}; // namespace mshadow - -namespace mshadow { - /*! \brief namespace for operators */ - namespace op { - // binary operator - /*! \brief mul operator */ - struct mul{ - /*! \brief map a, b to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { - return a * b; - } - }; - /*! \brief plus operator */ - struct plus { - /*! \brief map a, b to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { - return a + b; - } - }; - /*! \brief minus operator */ - struct minus { - /*! \brief map a, b to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { - return a - b; - } - }; - /*! \brief divide operator */ - struct div { - /*! \brief map a, b to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { - return a / b; - } - }; - /*! \brief get rhs */ - struct right { - /*! \brief map a, b to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { - return b; - } - }; - }; // namespace op - - /*! \brief namespace for savers */ - namespace sv { - /*! \brief save to saver: = */ - struct saveto { - /*! \brief save b to a using save method */ - MSHADOW_XINLINE static void Save(real_t& a, real_t b) { - a = b; - } - /*! \brief helper constant to use BLAS, alpha */ - MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f; - /*! \brief helper constant to use BLAS, beta */ - MSHADOW_CONSTEXPR static real_t kBetaBLAS = 0.0f; - /*! \brief corresponding binary operator type */ - typedef op::right OPType; - }; - /*! \brief save to saver: += */ - struct plusto { - /*! \brief save b to a using save method */ - MSHADOW_XINLINE static void Save(real_t& a, real_t b) { - a += b; - } - /*! \brief helper constant to use BLAS, alpha */ - MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f; - /*! \brief helper constant to use BLAS, beta */ - MSHADOW_CONSTEXPR static real_t kBetaBLAS = 1.0f; - /*! \brief corresponding binary operator type */ - typedef op::plus OPType; - }; - /*! \brief minus to saver: -= */ - struct minusto { - /*! \brief save b to a using save method */ - MSHADOW_XINLINE static void Save(real_t& a, real_t b) { - a -= b; - } - /*! \brief helper constant to use BLAS, alpha */ - MSHADOW_CONSTEXPR static real_t kAlphaBLAS = -1.0f; - /*! \brief helper constant to use BLAS, beta */ - MSHADOW_CONSTEXPR static real_t kBetaBLAS = 1.0f; - /*! \brief corresponding binary operator type */ - typedef op::minus OPType; - }; - /*! 
\brief multiply to saver: *= */ - struct multo { - /*! \brief save b to a using save method */ - MSHADOW_XINLINE static void Save(real_t& a, real_t b) { - a *= b; - } - /*! \brief corresponding binary operator type */ - typedef op::mul OPType; - }; - /*! \brief divide to saver: /= */ - struct divto { - /*! \brief save b to a using save method */ - MSHADOW_XINLINE static void Save(real_t& a, real_t b) { - a /= b; - } - /*! \brief corresponding binary operator type */ - typedef op::div OPType; - }; - }; // namespace sv - - - namespace op { - // unary operator/ function: example - // these operators can be defined by user, in the same style as binary and unary operator - // to use, simply write F( src ) - /*! \brief identity function that maps a real number to it self */ - struct identity{ - /*! \brief map a to result using defined operation */ - MSHADOW_XINLINE static real_t Map(real_t a) { - return a; - } - }; - }; // namespace op - - /*! \brief namespace for potential reducer operations */ - namespace red { - /*! \brief sum reducer */ - struct sum { - /*! \brief do reduction into dst */ - MSHADOW_XINLINE static void Reduce( volatile real_t& dst, volatile real_t src ) { - dst += src; - } - /*! \brief calculate gradient of redres with respect to redsrc, redres: reduced result, redsrc: one of reduction element */ - MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t redsrc ) { - return 1.0f; - } - /*! \brief an intial value of reducer */ - MSHADOW_CONSTEXPR static real_t kInitV = 0.0f; - }; - /*! \brief maximum reducer */ - struct maximum { - /*! \brief do reduction into dst */ - MSHADOW_XINLINE static void Reduce( volatile real_t& dst, volatile real_t src ) { - using namespace std; - dst = max( dst, src ); - } - /*! \brief calculate gradient of redres with respect to redsrc, redres: reduced result, redsrc: one of reduction element */ - MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t redsrc ) { - return redres == redsrc ? 1.0f: 0.0f; - } - /*! \brief an intial value of reducer */ -#if MSHADOW_SINGLE_PRECISION - MSHADOW_CONSTEXPR static real_t kInitV = -FLT_MAX; -#else - MSHADOW_CONSTEXPR static real_t kInitV = -DBL_MAX; -#endif - }; - }; - - /*! \brief namespace for helper utils of the project */ - namespace utils{ - /*! \brief send error message then exit */ - inline void Error( const char *msg ){ - fprintf( stderr, "Error:%s\n",msg ); - exit( -1 ); - } - /*! \brief assert a expression is true */ - inline void Assert( bool exp ){ - if( !exp ) Error( "AssertError" ); - } - /*! \brief assert a expression is true */ - inline void Assert( bool exp, const char *msg ){ - if( !exp ) Error( msg ); - } - /*! \brief warning */ - inline void Warning( const char *msg ){ - fprintf( stderr, "warning:%s\n",msg ); - } - }; // namespace utils -}; // namespace mshadow -#endif // TENSOR_BASE_H diff --git a/mshadow/tensor_container.h b/mshadow/tensor_container.h index f0699e735b0f..dbf250ceed28 100644 --- a/mshadow/tensor_container.h +++ b/mshadow/tensor_container.h @@ -1,152 +1,161 @@ -#ifndef MSHADOW_TENSOR_CONTAINER_H -#define MSHADOW_TENSOR_CONTAINER_H /*! + * Copyright (c) 2014 by Contributors * \file tensor_container.h * \brief tensor container that does memory allocation and resize like STL * \author Tianqi Chen */ -#include "tensor.h" -#include "tensor_io.h" +#ifndef MSHADOW_TENSOR_CONTAINER_H_ +#define MSHADOW_TENSOR_CONTAINER_H_ +#include "./tensor.h" +#include "./io.h" -namespace mshadow{ - /*! 
- * \brief tensor container that does memory allocation and resize like STL, - * use it to save the lines of FreeSpace in class. - * Do not abuse it, efficiency can come from pre-allocation and no re-allocation - * - * \tparam Device which device the tensor is on - * \tparam dimension dimension of the tensor - */ - template - class TensorContainer: public Tensor{ - public: - /*! - * \brief constructor - * \param pad whether use padding alignment in space allocation - */ - TensorContainer( bool pad = MSHADOW_ALLOC_PAD ){ - this->pad_ = pad; - this->dptr = data_.dptr = NULL; - this->shape[0] = 0; - this->shape.stride_ = 0; - this->data_.shape.stride_ = 0; - this->data_.shape[1] = 0; - } - /*! - * \brief constructor - * \param shape intial shape - */ - TensorContainer( const Shape &shape ){ - this->pad_ = MSHADOW_ALLOC_PAD; - data_.dptr = NULL; - this->AllocByShape( shape ); - } - /*! - * \brief constructor - * \param shape intial shape - * \param initv intial value - */ - TensorContainer( const Shape &shape, real_t initv ){ - this->pad_ = MSHADOW_ALLOC_PAD; - data_.dptr = NULL; - this->AllocByShape( shape ); - (*this) = initv; - } - ~TensorContainer( void ){ - this->FreeSpace(); - } - /*! - * \brief resize the container to given shape, content is NOT preserved - * \param shape target shape - */ - inline void Resize( const Shape &shape ){ - Shape<2> s2 = shape.FlatTo2D(); - if( s2.shape_[0] > data_.shape.stride_ || s2.shape_[1] > data_.shape[1] ){ - this->AllocByShape( shape ); - }else{ - this->shape = shape; - if( this->pad_ ){ - this->shape.stride_ = data_.shape.stride_; - }else{ - this->shape.stride_ = this->shape[ 0 ]; - } - } - } - /*! - * \brief resize the container to given shape, and initialize, content is NOT preserved - * \param shape target shape - * \param initv initialization value - */ - inline void Resize( const Shape &shape, real_t initv ){ - this->Resize( shape ); - (*this) = initv; - } - /*! \brief set whether padding is allowed in tensor */ - inline void set_pad( bool pad ){ - this->pad_ = pad; - } - /*! - * \brief save by binary format - * \param fo output binary stream - * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. - */ - template - inline void SaveBinary( TStream &fo ) const{ - mshadow::SaveBinary( fo, *this ); - } - /*! - * \brief load by binary format, a temp Tensor storage will be allocated - * \param fi input binary stream - * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. - */ - template - inline void LoadBinary( TStream &fi ) { - Tensor tmp; - mshadow::LoadBinary( fi, tmp, false ); - this->Resize( tmp.shape ); - Copy( *this, tmp ); - mshadow::FreeSpace( tmp ); - } - public: - // functions to fit exp template - inline Tensor& operator=( real_t s ){ - return this->__assign( s ); - } - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - template - inline Tensor& operator=( const expr::Exp &exp ){ - return this->__assign( exp ); - } - private: - /*! \brief whether we do padding in the space */ - bool pad_; - /*! 
\brief the shape of data_ is actually current data space */ - Tensor data_; - private: - inline void FreeSpace (void){ - if( data_.dptr != NULL ){ - mshadow::FreeSpace( data_ ); - data_.dptr = this->dptr = NULL; - } - } - inline void AllocByShape (const Shape& shape){ - if( data_.dptr != NULL ){ - this->FreeSpace(); - } - data_.shape = shape.FlatTo2D(); - mshadow::AllocSpace( data_, pad_ ); - this->dptr = data_.dptr; - this->shape = shape; - if( this->pad_ ){ - this->shape.stride_ = data_.shape.stride_; - }else{ - this->shape.stride_ = shape[0]; - } - } - }; -};// namespace mshadow +namespace mshadow { +/*! + * \brief tensor container that does memory allocation and resize like STL, + * use it to save the lines of FreeSpace in class. + * Do not abuse it, efficiency can come from pre-allocation and no re-allocation + * + * \tparam Device which device the tensor is on + * \tparam dimension dimension of the tensor + */ +template +class TensorContainer: public Tensor { + public: + /*! + * \brief constructor + * \param pad whether use padding alignment in space allocation + */ + explicit TensorContainer(bool pad = MSHADOW_ALLOC_PAD) { + this->pad_ = pad; + this->dptr_ = data_.dptr_ = NULL; + this->shape_[0] = 0; + this->stride_ = 0; + this->data_.stride_ = 0; + this->data_.shape_[0] = 0; + } + /*! + * \brief constructor + * \param shape intial shape + */ + explicit TensorContainer(const Shape &shape) { + this->pad_ = MSHADOW_ALLOC_PAD; + data_.dptr_ = NULL; + this->AllocByShape(shape); + } + /*! + * \brief constructor + * \param shape intial shape + * \param initv intial value + */ + explicit TensorContainer(const Shape &shape, DType initv) { + this->pad_ = MSHADOW_ALLOC_PAD; + data_.dptr = NULL; + this->AllocByShape(shape); + (*this) = initv; + } + ~TensorContainer(void) { + this->FreeSpace(); + } + /*! + * \brief resize the container to given shape, content is NOT preserved + * \param shape target shape + */ + inline void Resize(const Shape &shape) { + Shape<2> s2 = shape.FlatTo2D(); + if (s2.shape_[1] > data_.stride_ || s2.shape_[0] > data_.size(0)) { + this->AllocByShape(shape); + } else { + this->shape_ = shape; + if (this->pad_) { + this->stride_ = data_.stride_; + } else { + this->stride_ = s2.shape_[1]; + } + } + } + /*! + * \brief resize the container to given shape, and initialize, content is NOT preserved + * \param shape target shape + * \param initv initialization value + */ + inline void Resize(const Shape &shape, DType initv) { + this->Resize(shape); + (*this) = initv; + } + /*! \brief set whether padding is allowed in tensor */ + inline void set_pad(bool pad) { + this->pad_ = pad; + } + /*! + * \brief save by binary format + * \param fo output binary stream + * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. + */ + template + inline void SaveBinary(TStream &fo) const { + mshadow::SaveBinary(fo, *this); + } + /*! + * \brief load by binary format, a temp Tensor storage will be allocated + * \param fi input binary stream + * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. 
+ */ + template + inline void LoadBinary(TStream &fi) { + Tensor tmp; + mshadow::LoadBinary(fi, &tmp, false); + this->Resize(tmp.shape_); + Stream stream; + Copy(*this, tmp, &stream); + mshadow::FreeSpace(&tmp); + } + /*!\brief functions to fit expression template */ + inline Tensor &operator=(DType s) { + return this->__assign(s); + } + /*!\brief functions to fit expression template */ + template + inline Tensor & + operator=(const expr::Exp &exp) { + return this->__assign(exp); + } + /*!\brief functions to fit expression template */ + template + inline Tensor & + operator=(const expr::Exp &exp) { + return this->__assign(exp); + } + /*!\brief functions to fit expression template */ + template + inline Tensor & + operator=(const expr::Exp &exp) { + return this->__assign(exp); + } -#endif + private: + /*! \brief whether we do padding in the space */ + bool pad_; + /*! \brief the shape of data_ is actually current data space */ + Tensor data_; + // freespace + inline void FreeSpace(void) { + if (data_.dptr_ != NULL) { + mshadow::FreeSpace(&data_); + data_.dptr_ = this->dptr_ = NULL; + } + } + inline void AllocByShape(const Shape& shape) { + if (data_.dptr_ != NULL) this->FreeSpace(); + data_.shape_ = shape.FlatTo2D(); + mshadow::AllocSpace(&data_, pad_); + this->dptr_ = data_.dptr_; + this->shape_ = shape; + if (this->pad_) { + this->stride_ = data_.stride_; + } else { + this->stride_ = data_.size(1); + } + } +}; +} // namespace mshadow +#endif // MSHADOW_TENSOR_CONTAINER_H_ diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h new file mode 100644 index 000000000000..240c65faffd6 --- /dev/null +++ b/mshadow/tensor_cpu-inl.h @@ -0,0 +1,273 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file tensor_cpu-inl.h + * \brief implementation of CPU host code + * \author Bing Xu, Tianqi Chen + */ +#ifndef MSHADOW_TENSOR_CPU_INL_H_ +#define MSHADOW_TENSOR_CPU_INL_H_ +#include +#include "./base.h" +#include "./tensor.h" +#include "./sse-inl.h" + +namespace mshadow { +template<> +inline void InitTensorEngine(int dev_id) { +} +template<> +inline void ShutdownTensorEngine(void) { +} + +template<> +inline void SetDevice(int devid) { +} +template<> +inline Stream *NewStream(void) { + return new Stream(); +} +template<> +inline void DeleteStream(Stream *stream) { + delete stream; +} + +template +inline void *AllocHost_(size_t size); +template +inline void FreeHost_(void * dptr); + +#ifdef __CUDACC__ +template<> +inline void *AllocHost_(size_t size) { + void *dptr; + utils::Check(cudaMallocHost(&dptr, size, + cudaHostAllocPortable) == cudaSuccess, + "AllocHost"); + return dptr; +} +template<> +inline void FreeHost_(void *dptr) { + cudaFreeHost(dptr); +} +#endif + +template<> +inline void *AllocHost_(size_t size) { + size_t pitch; + return sse2::AlignedMallocPitch(&pitch, size, 1); +} +template<> +inline void FreeHost_(void *dptr) { + sse2::AlignedFree(dptr); +} + +template +inline void AllocHost(Tensor *obj) { + obj->stride_ = obj->size(dim - 1); + utils::Assert(obj->CheckContiguous(), "AllocHost"); + void *dptr = AllocHost_(obj->MSize() * sizeof(DType)); + obj->dptr_ = reinterpret_cast(dptr); +} +template +inline void FreeHost(Tensor *obj) { + utils::Assert(obj->dptr_ != NULL, "FreeHost:: double free"); + FreeHost_(obj->dptr_); + obj->dptr_ = NULL; +} + +template +inline void AllocSpace(Tensor *obj, bool pad) { + size_t pitch; + void *dptr; + if (pad) { + dptr = sse2::AlignedMallocPitch + (&pitch, obj->size(dim - 1) * sizeof(DType), obj->shape_.FlatTo2D()[0]); + obj->stride_ = 
static_cast(pitch / sizeof(DType)); + } else { + obj->stride_ = obj->size(dim - 1); + dptr = sse2::AlignedMallocPitch + (&pitch, obj->shape_.Size() * sizeof(DType), 1); + } + obj->dptr_ = reinterpret_cast(dptr); +} +template +inline Tensor +NewTensor(const Shape &shape, DType initv, bool pad) { + Tensor obj(shape); + AllocSpace(&obj, pad); + MapExp(&obj, expr::ScalarExp(initv)); + return obj; +} +template +inline void FreeSpace(Tensor *obj) { + sse2::AlignedFree(obj->dptr_); + obj->dptr_ = NULL; +} +template +inline void Copy(Tensor _dst, + const Tensor &_src, + Stream *stream) { + utils::Check(_dst.shape_ == _src.shape_, "Copy:shape mismatch"); + Tensor dst = _dst.FlatTo2D(); + Tensor src = _src.FlatTo2D(); + for (index_t y = 0; y < dst.size(0); ++y) { + memcpy(dst[y].dptr_, src[y].dptr_, sizeof(DType) * dst.size(1)); + } +} +template +inline void MapPlan(TRValue *dst, + const expr::Plan &plan) { + Shape<2> shape = expr::ShapeCheck::Check(dst->self()).FlatTo2D(); + expr::Plan dplan = expr::MakePlan(dst->self()); + for (index_t y = 0; y < shape[0]; ++y) { + for (index_t x = 0; x < shape[1]; ++x) { + // trust your compiler! -_- they will optimize it + Saver::Save(dplan.REval(y, x), plan.Eval(y, x)); + } + } +} +// code to handle SSE optimization +template +struct MapExpCPUEngine { + inline static void Map(TRValue *dst, + const expr::Exp &exp) { + MapPlan(dst, MakePlan(exp.self())); + } +}; + +#if MSHADOW_USE_SSE +template +struct MapExpCPUEngine, + dim, DType, E, etype> { + inline static void Map(Tensor *dst, + const expr::Exp &exp) { + if (expr::SSEAlignCheck::Check(exp.self()) && + expr::SSEAlignCheck >::Check(*dst)) { + expr::MapSSEPlan(dst->self(), MakeSSEPlan(exp.self())); + } else { + MapPlan(dst, MakePlan(exp.self())); + } + } +}; +#endif + +template +inline void MapExp(TRValue *dst, + const expr::Exp &exp) { + expr::TypeCheckPass::kMapPass> + ::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); + Shape eshape = expr::ShapeCheck::Check(exp.self()); + Shape dshape = expr::ShapeCheck::Check(dst->self()); + utils::Check(eshape[0] == 0 || eshape == dshape, + "Assignment: Shape of Tensors are not consistent with target"); +#if MSHADOW_USE_SSE + MapExpCPUEngine::kPass, Saver, R, dim, DType, E, etype> + ::Map(dst->ptrself(), exp); +#else + MapExpCPUEngine::Map(dst, exp); +#endif +} + +template +inline void MapReduceKeepLowest(TRValue *dst, + const expr::Exp &exp, + DType scale) { + expr::TypeCheckPass::kRedPass> + ::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + Shape<2> eshape = expr::ShapeCheck::kDim, E> + ::Check(exp.self()).FlatTo2D(); + Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self()); + utils::Check(eshape[1] == dshape[0], + "MapReduceKeepLowest::reduction dimension do not match"); + utils::Check(eshape[0] != 0, "can not reduce over empty tensor"); + // execution + expr::Plan dplan = MakePlan(dst->self()); + expr::Plan splan = MakePlan(exp.self()); + for (index_t x = 0; x < eshape[1]; ++x) { + DType res = splan.Eval(0, x); + for (index_t y = 1; y < eshape[0]; ++y) { + Reducer::Reduce(res, splan.Eval(y, x)); + } + Saver::Save(dplan.REval(0, x), res * scale); + } +} + +template +inline void MapReduceKeepHighDim(TRValue *dst, + const expr::Exp &exp, + DType scale) { + expr::TypeCheckPass::kRedPass> + ::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + typedef Shape::kDim> EShape; + EShape eshape = expr::ShapeCheck::kDim, E> + ::Check(exp.self()); + Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self()); + utils::Check(eshape[dimkeep] == dshape[0], + 
"MapReduceKeepHighDim::reduction dimension do not match"); + // use equvalent form + Shape<4> pshape = Shape4(eshape.ProdShape(0, dimkeep), + eshape[dimkeep], + eshape.ProdShape(dimkeep + 1, EShape::kSubdim), + eshape[EShape::kSubdim]); + // execution + expr::Plan dplan = MakePlan(dst->self()); + expr::Plan splan = MakePlan(exp.self()); + for (index_t c = 0; c < pshape[1]; ++c) { + DType res; Reducer::SetInitValue(res); + for (index_t n = 0; n < pshape[0]; ++n) { + DType tres; Reducer::SetInitValue(tres); + for (index_t y = 0; y < pshape[2]; ++y) { + for (index_t x = 0; x < pshape[3]; ++x) { + Reducer::Reduce(tres, + splan.Eval((n * pshape[1] + c) * pshape[2] + y, x)); + } + } + Reducer::Reduce(res, tres); + } + Saver::Save(dplan.REval(0, c), res * scale); + } +} + +template +inline void Softmax(Tensor dst, + const Tensor &energy) { + DType mmax = energy[0]; + for (index_t x = 1; x < dst.size(0); ++x) { + if (mmax < energy[x]) mmax = energy[x]; + } + DType sum = 0.0f; + for (index_t x = 0; x < dst.size(0); ++x) { + dst[x] = std::exp(energy[x] - mmax); + sum += dst[x]; + } + for (index_t x = 0; x < dst.size(0); ++x) { + dst[x] /= sum; + } +} +template +inline void Softmax(Tensor dst, + const Tensor &energy) { + utils::Check(dst.shape_ == energy.shape_, "Softmax: shape mismatch"); + for (index_t y = 0; y < dst.size(0); ++y) { + Softmax(dst[y], energy[y]); + } +} + +template +inline DType VDot(const Tensor &lhs, + const Tensor &rhs) { + utils::Check(lhs.shape_ == rhs.shape_, "VDot: shape mismatch"); + DType sum = static_cast(0); + for (index_t x = 0; x < lhs.size(0); ++x) { + sum += lhs[x] * rhs[x]; + } + return sum; +} +} // namespace mshadow +#endif // MSHADOW_TENSOR_CPU_INL_H_ diff --git a/mshadow/tensor_cpu-inl.hpp b/mshadow/tensor_cpu-inl.hpp deleted file mode 100644 index 0fa3cfa50306..000000000000 --- a/mshadow/tensor_cpu-inl.hpp +++ /dev/null @@ -1,168 +0,0 @@ -#ifndef MSHADOW_TENSOR_CPU_INL_HPP -#define MSHADOW_TENSOR_CPU_INL_HPP -/*! - * \file tensor_cpu-inl.hpp - * \brief implementation of CPU host code - * \author Bing Xu, Tianqi Chen - */ -#include -#include "tensor_base.h" -#include "tensor_sse-inl.hpp" - -namespace mshadow { - template - inline void AllocSpace(Tensor &obj, bool pad ){ - size_t pitch; - if( pad ){ - obj.dptr = (real_t*)sse2::AlignedMallocPitch - ( pitch, obj.shape[0] * sizeof(real_t), obj.FlatTo2D().shape[1] ); - obj.shape.stride_ = static_cast( pitch / sizeof(real_t) ); - }else{ - obj.shape.stride_ = obj.shape[0]; - obj.dptr = (real_t*)sse2::AlignedMallocPitch - ( pitch, obj.shape.Size() * sizeof(real_t), 1 ); - } - } - - template - inline Tensor NewTensor(const Shape &shape, real_t initv, bool pad ){ - Tensor obj( shape ); - AllocSpace( obj, pad ); - MapExp( obj, expr::ScalarExp( initv ) ); - return obj; - } - - template - inline void FreeSpace(Tensor &obj){ - sse2::AlignedFree( obj.dptr ); - obj.dptr = NULL; - } - - template - inline void Copy(Tensor _dst, const Tensor &_src ){ - utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" ); - Tensor dst = _dst.FlatTo2D(); - Tensor src = _src.FlatTo2D(); - for (index_t y = 0; y < dst.shape[1]; ++y ) { - memcpy( dst[y].dptr, src[y].dptr, sizeof(real_t) * dst.shape[0] ); - } - } - - template - inline void MapPlan(Tensor _dst, const expr::Plan &plan){ - Tensor dst = _dst.FlatTo2D(); - for (index_t y = 0; y < dst.shape[1]; ++y ) { - for (index_t x = 0; x < dst.shape[0]; ++x ) { - // trust your compiler! 
-_- they will optimize it - Saver::Save(dst[y][x], plan.Eval( y, x ) ); - } - } - } - - // code to handle SSE optimization - template - struct MapExpCPUEngine; - template - struct MapExpCPUEngine{ - inline static void Map(Tensor dst, const expr::Exp &exp ){ - MapPlan( dst, MakePlan( exp.self() ) ); - } - }; - - #if MSHADOW_USE_SSE - template - struct MapExpCPUEngine{ - inline static void Map(Tensor dst, const expr::Exp &exp ){ - using namespace expr; - if( SSEAlignCheck::Check( exp.self() ) && SSEAlignCheck< dim,Tensor >::Check(dst) ){ - MapSSEPlan( dst, MakeSSEPlan( exp.self() ) ); - }else{ - MapPlan( dst, MakePlan( exp.self() ) ); - } - } - }; - #endif - - template - inline void MapExp(Tensor dst, const expr::Exp &exp ){ - using namespace expr; - TypeCheckPass< TypeCheck::kMapPass >::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); - Shape eshape = ShapeCheck::Check( exp.self() ); - utils::Assert( eshape[0] == 0 || eshape == dst.shape, "Assignment: Shape of Tensors in expression is not consistent with target" ); - #if MSHADOW_USE_SSE - MapExpCPUEngine< SSECheck::kPass,Saver,dim,E,etype >::Map( dst, exp ); - #else - MapExpCPUEngine< false,Saver,dim,E,etype >::Map( dst, exp ); - #endif - } - - template - inline void MapReduceKeepLowest( Tensor dst, const expr::Exp &exp, real_t scale ){ - using namespace expr; - TypeCheckPass< TypeCheck::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); - Shape<2> eshape = ShapeCheck< ExpInfo::kDim, E >::Check( exp.self() ).FlatTo2D(); - - utils::Assert( eshape[0] == dst.shape[0], "reduction dimension do not match" ); - utils::Assert( eshape[1] != 0, "can not reduce over empty tensor" ); - // execution - expr::Plan plan = MakePlan( exp.self() ); - for( index_t x = 0; x < eshape[0]; ++x ){ - real_t res = plan.Eval( 0, x ); - for( index_t y = 1; y < eshape[1]; ++y ){ - Reducer::Reduce( res, plan.Eval( y, x ) ); - } - Saver::Save( dst[x], res*scale ); - } - } - - template - inline void MapReduceKeepHighDim( Tensor dst, const expr::Exp &exp, real_t scale ){ - using namespace expr; - TypeCheckPass< TypeCheck::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); - typedef Shape< ExpInfo::kDim > EShape; - EShape eshape = ShapeCheck< ExpInfo::kDim, E >::Check( exp.self() ); - utils::Assert( eshape[dimkeep] == dst.shape[0], "reduction dimension do not match" ); - // use equvalent form - Shape<4> pshape = Shape4( eshape.ProdShape(dimkeep+1,EShape::kMaxShape), eshape[dimkeep], - eshape.ProdShape(1,dimkeep), eshape[0] ); - - // execution - expr::Plan plan = MakePlan( exp.self() ); - - for( index_t c = 0; c < pshape[2]; ++c ){ - real_t res = Reducer::kInitV; - for( index_t n = 0; n < pshape[3]; ++n ){ - real_t tres = Reducer::kInitV; - for( index_t y = 0; y < pshape[1]; ++y ){ - for( index_t x = 0; x < pshape[0]; ++x ){ - Reducer::Reduce( tres, plan.Eval( (n*pshape[2] + c) * pshape[1] + y, x ) ); - } - } - Reducer::Reduce( res, tres ); - } - Saver::Save( dst[c], res*scale ); - } - } - - inline void Softmax( Tensor dst, const Tensor& energy ){ - real_t mmax = energy[0]; - for( real_t x = 1; x < dst.shape[0]; ++x ) - if( mmax < energy[x] ) mmax = energy[x]; - real_t sum = 0.0f; - for( index_t x = 0; x < dst.shape[0]; ++x ){ - dst[x] = std::exp( energy[x] - mmax ); - sum += dst[x]; - } - for( index_t x = 0; x < dst.shape[0]; ++x ){ - dst[x] /= sum; - } - } - inline void Softmax( Tensor dst, const Tensor& energy ){ - utils::Assert( dst.shape == energy.shape, "Softmax: shape mismatch" ); - for( index_t y = 0; y < dst.shape[1]; ++y ){ - Softmax( dst[y], energy[y] ); - } 
- } -}; // namespace mshadow - -#endif // TENSOR_CPU_INL_HPP diff --git a/mshadow/tensor_expr.h b/mshadow/tensor_expr.h deleted file mode 100644 index ac8fde79f1c6..000000000000 --- a/mshadow/tensor_expr.h +++ /dev/null @@ -1,367 +0,0 @@ -#ifndef MSHADOW_TENSOR_EXPR_H -#define MSHADOW_TENSOR_EXPR_H -/*! - * \file tensor_expr.h - * \brief definitions of abstract expressions and expressions template - * \author Tianqi Chen, Bing Xu - */ -#include "tensor_base.h" - -namespace mshadow{ - /*! - * \brief namespace for abstract expressions and expressions template, - * have no dependecy on tensor.h, - * These data structure takes no charge in computations, - * they are only used to define operations and represent expression in a symbolic way - */ - namespace expr{ - - /*! \brief type of expressions */ - namespace type{ - /*! \brief this expression directly correspnds to a data class */ - const int kContainer = 0; - /*! \brief this only contains element-wise vector operations */ - const int kMapper = 1; - /*! \brief othercase: e.g dot product */ - const int kComplex = 3; - }; - - /*! - * \brief expression engine that actually interprets these expressions - * this is a function template that needed to be implemented for specific expressions - */ - template - struct ExpEngine{ - template - inline static void Eval( Container& dst, const EType &exp ); - }; - - template - class ContainerExp; - class ScalarExp; - - /*! - * \brief base class for expression - * \tparam SubType inheritated class must put their type into this parameter - * \tparam exp_type expression type, see namespace type - */ - template - struct Exp{ - public: - /*! \return subtype instance of current class */ - inline const SubType& self( void ) const{ - return *static_cast(this); - } - /*! \return reference of subtype instance of current class */ - inline SubType& refself( void ){ - return *static_cast(this); - } - }; - - /*! \brief scalar expression */ - struct ScalarExp: public Exp{ - /*! \brief scalar value */ - real_t scalar_; - /*! \brief constructor */ - ScalarExp( real_t scalar ):scalar_(scalar){} - }; - - /*! \brief represent a transpose expression of a container */ - template - struct TransposeExp: public Exp< TransposeExp, type::kComplex >{ - public: - /*! \brief expression to be transposed */ - const EType &exp; - /*! \brief constructor */ - TransposeExp( const EType &e ):exp(e){} - /*! \brief transpose expression */ - inline const EType & T( void ) const{ - return exp; - } - }; - - /*! - * \brief base class of all variables, that can be assigned to values - * \tparam Container the actually class of data container, e.g. CTensor1D - */ - template - class ContainerExp: public Exp< Container, type::kContainer >{ - public: - /*! - *\brief transpose of a matrix - *\return transpose of current expression - */ - inline const TransposeExp T( void ) const{ - return TransposeExp( this->self() ); - } - public: - /*! \brief operator overload */ - inline Container &operator+=( real_t s ){ - ExpEngine::Eval( this->refself(), ScalarExp(s) ); - return this->refself(); - } - /*! \brief operator overload */ - inline Container &operator-=( real_t s ){ - ExpEngine::Eval( this->refself(), ScalarExp(s) ); - return this->refself(); - } - /*! \brief operator overload */ - inline Container &operator*=( real_t s ){ - ExpEngine::Eval( this->refself(), ScalarExp(s) ); - return this->refself(); - } - /*! 
\brief operator overload */ - inline Container &operator/=( real_t s ){ - ExpEngine::Eval( this->refself(), ScalarExp(s) ); - return this->refself(); - } - /*! \brief operator overload */ - inline Container &__assign( real_t s ){ - ExpEngine::Eval( this->refself(), ScalarExp(s) ); - return this->refself(); - } - public: - /*! \brief implementation of operator=, note that we can not define container = container */ - template - inline Container &__assign( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - /*! \brief implementation of operator=, note that we can not define container = container */ - template - inline Container &__assign( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - /*! \brief implementation of operator+= */ - template - inline Container &operator+=( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - /*! \brief implementation of operator-= */ - template - inline Container &operator-=( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - /*! \brief implementation of operator*= */ - template - inline Container &operator*=( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - /*! \brief implementation of operator/= */ - template - inline Container &operator/=( const Exp &exp ){ - ExpEngine::Eval( this->refself(), exp.self() ); - return this->refself(); - } - }; - }; // namespace expr - - namespace expr{ - /*! - * \brief matrix multiplication expression dot( lhs[.T], rhs[.T] ) - * \tparam TA type of lhs - * \tparam TB type of rhs - * \tparam ltrans whether lhs is transposed - * \tparam rtrans whether rhs is transposed - */ - template - struct DotExp: public Exp< DotExp, type::kComplex >{ - /*! \brief left operand */ - const TA& lhs_; - /*! \brief right operand */ - const TB& rhs_; - /*! \brief scale over result */ - real_t scale_; - /*! \brief constructor */ - DotExp( const TA &lhs, const TB &rhs, real_t scale ) - :lhs_(lhs),rhs_(rhs),scale_(scale){} - }; - - /*! \brief dot operator def */ - template - inline DotExp dot( const ContainerExp &lhs, const ContainerExp &rhs ){ - return DotExp( lhs.self(), rhs.self(), 1.0f ); - } - /*! \brief dot operator def */ - template - inline DotExp dot( const TransposeExp &lhs, const ContainerExp &rhs ){ - return DotExp( lhs.exp, rhs.self(), 1.0f ); - } - /*! \brief dot operator def */ - template - inline DotExp dot( const ContainerExp &lhs, const TransposeExp &rhs ){ - return DotExp( lhs.self(), rhs.exp, 1.0f ); - } - /*! \brief dot operator def */ - template - inline DotExp dot( const TransposeExp &lhs, const TransposeExp &rhs ){ - return DotExp( lhs.exp, rhs.exp, 1.0f ); - } - /*! \brief dot operator def */ - template - inline DotExp operator*( const DotExp &lhs, real_t rhs ){ - return DotExp( lhs.lhs_, lhs.rhs_, lhs.scale_ * rhs ); - } - /*! \brief scale of dot operation */ - template - inline DotExp operator*( real_t lhs, const DotExp &rhs ){ - return DotExp( rhs.lhs_, rhs.rhs_, rhs.scale_ * lhs ); - } - }; // namespace expr - - namespace expr{ - /*! - * \brief binary map expression lhs [op] rhs - * \tparam OP operator - * \tparam TA type of lhs - * \tparam TB type of rhs - * \tparam etype expression type, sa namespace::type - */ - template - struct BinaryMapExp: public Exp< BinaryMapExp, etype >{ - /*! \brief left operand */ - const TA& lhs_; - /*! \brief right operand */ - const TB& rhs_; - /*! 
\brief constructor */ - BinaryMapExp( const TA &lhs, const TB &rhs ) - :lhs_(lhs), rhs_(rhs){} - }; - - /*! \brief make expression */ - template - inline BinaryMapExp MakeExp( const Exp &lhs, const Exp &rhs ){ - return BinaryMapExp( lhs.self(), rhs.self() ); - } - - /*! - * \brief short hand for MakeExp, usage F(lhs, rhs). create a binary operation expression - * \param lhs left operand - * \param rhs right operand - * \tparam binary operator - * \tparam TA lhs expression - * \tparam ta lhs expression type - * \tparam TB rhs expression - * \tparam tb rhs expression type - * \sa mshadow::op - */ - template - inline BinaryMapExp F( const Exp &lhs, const Exp &rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload for const */ - template - inline BinaryMapExp F( const Exp &lhs, const ScalarExp &rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload for const */ - template - inline BinaryMapExp F( const ScalarExp &lhs, const Exp& rhs ){ - return MakeExp( lhs, rhs ); - } - - // operator rules - /*! \brief operator overload */ - template - inline BinaryMapExp operator+( const Exp &lhs, const Exp &rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator-( const Exp &lhs, const Exp &rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator*( const Exp &lhs, const Exp &rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator/( const Exp &lhs, const Exp &rhs ){ - return MakeExp( lhs, rhs ); - } - // constant operators - /*! \brief operator overload */ - template - inline BinaryMapExp operator+( const Exp& lhs, const ScalarExp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator-( const Exp& lhs, const ScalarExp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator*( const Exp& lhs, const ScalarExp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator/( const Exp& lhs, const ScalarExp& rhs ){ - return MakeExp( lhs, rhs ); - } - // constant operators 2 - /*! \brief operator overload */ - template - inline BinaryMapExp operator+( const ScalarExp& lhs, const Exp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator-( const ScalarExp& lhs, const Exp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator*( const ScalarExp& lhs, const Exp& rhs ){ - return MakeExp( lhs, rhs ); - } - /*! \brief operator overload */ - template - inline BinaryMapExp operator/( const ScalarExp& lhs, const Exp& rhs ){ - return MakeExp( lhs, rhs ); - } - }; - - namespace expr{ - /*! - * \brief unary map expression op(src) - * \tparam OP operator - * \tparam TA type of src - * \tparam etype expression type, sa namespace::type - */ - template - struct UnaryMapExp: public Exp< UnaryMapExp, etype >{ - /*! \brief source expression */ - const TA& src_; - /*! \brief constructor */ - UnaryMapExp( const TA &src ):src_(src){} - }; - - /*! \brief make expression */ - template - inline UnaryMapExp MakeExp( const Exp &src ){ - return UnaryMapExp( src.self() ); - } - - /*! 
- * \brief short hand for MakeExp, usage F(src), create a unary operation expression - * \param src source expression - * \tparam operator - * \tparam TA source expression - * \tparam ta source expression type - * \sa mshadow::op - */ - template - inline UnaryMapExp F( const Exp &src ){ - return MakeExp(src); - } - }; -}; -#endif diff --git a/mshadow/tensor_expr_engine-inl.hpp b/mshadow/tensor_expr_engine-inl.hpp deleted file mode 100644 index 9c5f2c7f7a86..000000000000 --- a/mshadow/tensor_expr_engine-inl.hpp +++ /dev/null @@ -1,416 +0,0 @@ -#ifndef MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP -#define MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP -/*! - * \file tensor_expr_engine-inl.hpp - * \brief definitions of how expressions should be evaluated - * \author Tianqi Chen, Bing Xu - */ -#include "tensor_expr.h" -#include "tensor.h" - -namespace mshadow{ - namespace expr{ - /*! - * \brief a general class that allows extension that makes tensors of some shape - * \tparam SubType type of subclass - * \tparam SrcExp source expression of the MakeTensorExp, the source of operation - * \tparam dim dimension of the expression - */ - template - struct MakeTensorExp: public Exp< MakeTensorExp, type::kMapper >{ - /*! \brief the shape of this expression */ - Shape shape_; - /*! \brief true self of subtype */ - inline const SubType& real_self( void ) const{ - return *static_cast(this); - } - }; - }; - - namespace expr{ - /*! \brief This part of code gives plan that can be used to carry out execution */ - template - class Plan{ - public: - /*! - * \brief evaluate the expression at index [y][x] - * to be implemented by SubType - */ - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const; - }; - - template - class Plan< Tensor >{ - public: - Plan( const Tensor &t ) - :dptr_(t.dptr),stride_(t.shape.stride_){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ y * stride_ + x ]; - } - private: - const real_t *dptr_; - index_t stride_; - }; - // special evaluation case for 1d tensor - template - class Plan< Tensor >{ - public: - Plan( const Tensor &t ):dptr_(t.dptr){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ x ]; - } - private: - const real_t *dptr_; - }; - - template<> - class Plan{ - public: - Plan( real_t scalar ):scalar_(scalar){} - /*! 
\brief evaluate at [y][x] */ - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return scalar_; - } - private: - real_t scalar_; - }; - - template - class Plan< BinaryMapExp >{ - public: - Plan( const Plan &lhs, const Plan &rhs ) - :lhs_(lhs), rhs_(rhs){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return OP::Map( lhs_.Eval( y, x ), rhs_.Eval( y, x ) ); - } - private: - Plan lhs_; - Plan rhs_; - }; - - template - class Plan< UnaryMapExp >{ - public: - Plan( const Plan &src ):src_(src){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return OP::Map( src_.Eval( y, x ) ); - } - private: - Plan src_; - }; - - - template - struct Plan< MakeTensorExp >{ - public: - Plan( const Plan &src ):src_(src){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return src_.Eval( y, x ); - } - private: - Plan src_; - }; - - // allow UnaryMap see the plan - template - inline Plan< BinaryMapExp > MakePlan( const BinaryMapExp &e ); - - // translate from exp to execution plan - inline Plan MakePlan( const ScalarExp &e ){ - return Plan( e.scalar_ ); - } - - template - inline Plan MakePlan( const ContainerExp &e ){ - return Plan( e.self() ); - } - - template - inline Plan< T > MakePlan( const MakeTensorExp &e ){ - return Plan< T >( e.real_self() ); - } - - template - inline Plan< UnaryMapExp > MakePlan( const UnaryMapExp &e ){ - return Plan< UnaryMapExp >( MakePlan(e.src_) ); - } - - template - inline Plan< BinaryMapExp > MakePlan( const BinaryMapExp &e ){ - return Plan< BinaryMapExp >( MakePlan(e.lhs_), MakePlan(e.rhs_) ); - } - }; // namespace expr - - namespace expr{ - /*! - * \brief static type inference template, - * used to get the dimension of each expression, - * if ExpInfo::kDim == -1, this means here are mismatch in expression - * if ( ExpInfo::kDevMask & cpu::kDevMask ) != 0, this means this expression can be assigned to cpu - * \tparam E expression - */ - template - struct ExpInfo{ - const static int kDim = -1; - const static int kDevMask = 0; - }; - template<> - struct ExpInfo{ - const static int kDim = 0; - const static int kDevMask = 0xffff; - }; - template - struct ExpInfo< Tensor >{ - const static int kDim = dim; - const static int kDevMask = Device::kDevMask; - }; - template - struct ExpInfo< MakeTensorExp >{ - const static int kDimSrc = ExpInfo::kDim; - const static int kDim = kDimSrc >= 0 ? dim : -1; - const static int kDevMask = ExpInfo::kDevMask; - }; - template - struct ExpInfo< UnaryMapExp >{ - const static int kDim = ExpInfo::kDim; - const static int kDevMask = ExpInfo::kDevMask; - }; - template - struct ExpInfo< BinaryMapExp >{ - const static int kDimLhs = ExpInfo::kDim; - const static int kDimRhs = ExpInfo::kDim; - const static int kDim = (kDimLhs>=0 && kDimRhs >= 0) ? \ - ( kDimLhs==0 ? kDimRhs : ( (kDimRhs==0||kDimLhs==kDimRhs) ? kDimLhs : -1 ) ):-1; - const static int kDevMask = ExpInfo::kDevMask & ExpInfo::kDevMask; - }; - - /*! \brief template to do type check */ - template - struct TypeCheck{ - /*! \brief dimension of expression*/ - const static int kExpDim = ExpInfo::kDim; - /*! \brief whether the expression device type matches */ - const static bool kDevPass = (ExpInfo::kDevMask & Device::kDevMask) != 0; - /*! \brief whether the expression can be mapped to expression of dim */ - const static bool kMapPass = (kExpDim == 0 || kExpDim == dim) && kDevPass; - /*! 
\brief whether the expression can be reduced to expression of dim */ - const static bool kRedPass = (kExpDim > dim) && kDevPass; - }; - - template - struct TypeCheckPass; - template<> - struct TypeCheckPass{}; - template<> - struct TypeCheckPass{ - inline static void Error_All_Tensor_in_Exp_Must_Have_Same_Type( void ){} - inline static void Error_TypeCheck_Not_Pass_For_Reduce_Exp( void ){} - inline static void Error_Expression_Does_Not_Meet_Dimension_Req( void ){} - }; - }; // namespace expr - - namespace expr{ - // check shape consistency - template - struct ShapeCheck{ - inline static Shape Check( const E &t ); - }; - - template - struct ShapeCheck{ - inline static Shape Check( const ScalarExp &exp ){ - // use lowest dimension to mark scalar exp - Shape shape; shape[0] = 0; - return shape; - } - }; - template - struct ShapeCheck >{ - inline static Shape Check( const Tensor &t ){ - return t.shape; - } - }; - template - struct ShapeCheck >{ - inline static Shape Check( const MakeTensorExp &t ){ - return t.shape_; - } - }; - template - struct ShapeCheck< dim,UnaryMapExp >{ - inline static Shape Check( const UnaryMapExp &t ){ - Shape s = ShapeCheck::Check( t.src_ ); - return s; - } - }; - template - struct ShapeCheck< dim, BinaryMapExp >{ - inline static Shape Check( const BinaryMapExp &t ){ - Shape shape1 = ShapeCheck::Check( t.lhs_ ); - Shape shape2 = ShapeCheck::Check( t.rhs_ ); - if( shape1[0] == 0 ) return shape2; - if( shape2[0] == 0 ) return shape1; - utils::Assert( shape1 == shape2, "BinaryMapExp: Shapes of two tensors in BinaryMapExp expression is not the same"); - return shape1; - } - }; - }; // namespace expr - - // the matrix OP depends on BLAS - namespace expr{ - template - struct DotEngine{ - inline static void Eval( Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale ); - }; - - // handles the dot - template - struct BLASEngine; - - #if (MSHADOW_USE_CBLAS||MSHADOW_USE_MKL) - template<> - struct BLASEngine{ - inline static CBLAS_TRANSPOSE GetT( bool t ){ - return t ? 
CblasTrans : CblasNoTrans; - } - inline static void gemm( bool transa, bool transb, int m, int n, int k, float alpha, \ - const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ){ - cblas_sgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemm( bool transa, bool transb, int m, int n, int k, double alpha, \ - const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc ){ - cblas_dgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemv( bool trans, int m, int n, float alpha, const float *A, int lda, \ - const float *X, int incX, float beta, float *Y, int incY ){ - cblas_sgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void gemv( bool trans, int m, int n, double alpha, const double *A, int lda, \ - const double *X, int incX, double beta, double *Y, int incY ){ - cblas_dgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void ger( int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda ){ - cblas_sger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); - } - inline static void ger( int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda ){ - cblas_dger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); - } - }; - #endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL - - #if MSHADOW_USE_CUDA - // All CuBLAS goes to here, use legacy API: not threadsafe - template<> - struct BLASEngine{ - inline static char GetT( bool t ){ - return t ? 'T' : 'N'; - } - inline static void gemm( bool transa, bool transb, int m, int n, int k, float alpha, - const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ){ - cublasSgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemm( bool transa, bool transb, int m, int n, int k, double alpha, - const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc ){ - cublasDgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); - } - inline static void gemv( bool trans, int m, int n, float alpha, const float *A, int lda, \ - const float *X, int incX, float beta, float *Y, int incY ){ - cublasSgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void gemv( bool trans, int m, int n, double alpha, const double *A, int lda, \ - const double *X, int incX, double beta, double *Y, int incY ){ - cublasDgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); - } - inline static void ger( int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda ){ - cublasSger(m,n,alpha,X,incX,Y,incY,A,lda); - } - inline static void ger( int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda ){ - cublasDger(m,n,alpha,X,incX,Y,incY,A,lda); - } - }; - #endif - - // helper function to decide which shape we are in - inline static Shape<2> GetShape( const Shape<2> &shape, bool transpose ){ - return transpose ? 
Shape2(shape[0],shape[1]) : shape; - } - // dst = dot( lhs[.T], rhs[.T] ) - template - struct DotEngine{ - inline static void Eval( Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale ) { - Shape<2> sleft = GetShape( lhs.shape, transpose_left ); - Shape<2> sright = GetShape( rhs.shape, transpose_right ); - utils::Assert( dst.shape[1] == sleft[1] && dst.shape[0] == sright[0] \ - && sleft[0] == sright[1] , "dot-gemm: matrix shape mismatch" ); - // use column major argument to compatible with most BLAS - BLASEngine::gemm - ( transpose_right , transpose_left, - transpose_right ? rhs.shape[1] : rhs.shape[0], - transpose_left ? lhs.shape[0] : lhs.shape[1], - transpose_right ? rhs.shape[0] : rhs.shape[1], - scale * SV::kAlphaBLAS, - rhs.dptr, rhs.shape.stride_, - lhs.dptr, lhs.shape.stride_, - SV::kBetaBLAS, - dst.dptr, dst.shape.stride_ ); - } - }; - template - struct DotEngine{ - inline static void Eval( Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale ) { - Shape<2> sright = GetShape( rhs.shape, transpose_right ); - utils::Assert( dst.shape[0] == sright[0] && lhs.shape[0] == sright[1], "dot-gemv: matrix shape mismatch"); - BLASEngine::gemv - ( transpose_right, - rhs.shape[0], rhs.shape[1], scale * SV::kAlphaBLAS, - rhs.dptr, rhs.shape.stride_, - lhs.dptr, 1, SV::kBetaBLAS, - dst.dptr, 1 ); - } - }; - template - struct DotEngine{ - inline static void Eval( Tensor &dst, const Tensor &lhs, const Tensor &rhs, real_t scale ) { - utils::Assert( dst.shape[1] == lhs.shape[0] && dst.shape[0] == rhs.shape[0], "dot-ger: matrix shape mismatch" ); - if( SV::kBetaBLAS < 1e-6f ){ - BLASEngine::ger - ( rhs.shape[0], lhs.shape[0], scale * SV::kAlphaBLAS, - rhs.dptr, 1, lhs.dptr, 1, dst.dptr, dst.shape.stride_ ); - }else{ - DotEngine::Eval( dst, lhs.FlatTo2D(), rhs.FlatTo2D(), scale ); - } - } - }; - - }; // namespace expr - - namespace expr{ - /*! \brief some engine that evaluate complex expression */ - template - struct ExpComplexEngine{ - inline static void Eval( Tensor& dst, const E &exp ); - }; - template - struct ExpEngine >{ - template - inline static void Eval( Tensor& dst, const Exp &exp ){ - MapExp( dst, exp ); - } - template - inline static void Eval( Tensor& dst, const Exp &exp ){ - MapExp( dst, exp ); - } - template - inline static void Eval( Tensor& dst, const Exp &exp ){ - ExpComplexEngine::Eval( dst, exp.self() ); - } - }; - template - struct ExpComplexEngine< SV, Device, dim, DotExp< Tensor, Tensor, ltrans, rtrans > >{ - inline static void Eval( Tensor &dst, const DotExp< Tensor, Tensor, ltrans, rtrans > &exp ){ - DotEngine::Eval( dst, exp.lhs_, exp.rhs_, exp.scale_ ); - } - }; - }; // namespace expr -}; -#endif diff --git a/mshadow/tensor_expr_ext.h b/mshadow/tensor_expr_ext.h deleted file mode 100644 index 8399b1b7a26b..000000000000 --- a/mshadow/tensor_expr_ext.h +++ /dev/null @@ -1,978 +0,0 @@ -#ifndef MSHADOW_TENSOR_EXPR_EXT_H -#define MSHADOW_TENSOR_EXPR_EXT_H -/*! - * \file tensor_expr_ext.h - * \brief some extension of expressions, used to support something beyond elementwise op - * \author Tianqi Chen, Bing Xu - */ -#include "tensor_expr_engine-inl.hpp" -namespace mshadow{ - // Declaration of expressions goes here - namespace expr{ - /*! 
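// Illustrative sketch of the operand-swap trick used by the DotEngine::Eval
// specializations above: mshadow stores tensors row-major, while the BLAS call
// is issued with CblasColMajor.  A row-major matrix viewed as column-major is
// its transpose, so C = A * B (all row-major) is computed as C^T = B^T * A^T,
// which is why rhs is passed to gemm before lhs.  Standalone example assuming
// a CBLAS header, not mshadow code:
#include <cblas.h>
inline void RowMajorGemm(const float *A, const float *B, float *C,
                         int m, int n, int k) {
  // A: m x k, B: k x n, C: m x n, all row-major and densely packed.
  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
              n, m, k, 1.0f,
              B, n,   // B viewed column-major is B^T (n x k), ld = n
              A, k,   // A viewed column-major is A^T (k x m), ld = k
              0.0f,
              C, n);  // C viewed column-major is C^T (n x m), ld = n
}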
- * \brief broadcast Tensor1D into a higher dimension Tensor - * input: Tensor: ishape[0] - * output: Tensor : oshape[dimcast] = ishape[0] - * \tparam Device which device it lies - * \tparam dimdst target tensor dimension - * \tparam dimcast the dimension where the 1D tensor fills in by index - */ - template - struct Broadcast1DExp: public MakeTensorExp< Broadcast1DExp,Tensor,dimdst>{ - /*! \brief source operand */ - const Tensor src_; - /*! \brief constructor */ - Broadcast1DExp( const Tensor &src, Shape shape ):src_(src){ - this->shape_ = shape; - } - }; - - /*! - * \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution, this expression allow unpack of a batch - * this is a version support unpacking multiple images - * after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations: - * \tparam SrcExp source expression - * \tparam dstdim destination dimension - */ - template - struct UnpackPatchToColXExp: public MakeTensorExp< UnpackPatchToColXExp, SrcExp, 2>{ - /*! \brief source operand */ - const SrcExp& img_; - /*! \brief patch size */ - index_t psize_; - /*! \brief patch stride */ - index_t pstride_; - /*! \brief number of input channel */ - index_t i_channel_; - /*! \brief height of img */ - index_t i_height_; - /*! \brief width of img */ - index_t i_width_; - /*! \brief constructor */ - UnpackPatchToColXExp( const SrcExp &img, index_t psize, index_t pstride ) - :img_(img), psize_(psize), pstride_(pstride){ - Shape imshape = ShapeCheck::Check( img_ ); - utils::Assert( imshape[0] >= psize && imshape[1] >= psize, "UnpackPatchToCol:image shape smaller than patch size"); - this->i_channel_ = imshape[2]; - this->i_height_ = imshape[1]; - this->i_width_ = imshape[0]; - // calculate number of batches - const index_t num = imshape.ProdShape( 3, srcdim ); - const index_t o_height = ( i_height_ - psize ) / pstride + 1; - const index_t o_width = ( i_width_ - psize ) / pstride + 1; - this->shape_[0] = o_height * o_width * num; - this->shape_[1] = psize * psize * imshape[2]; - } - }; - - /*! - * \brief reverse operation of UnpackPatchToCol, used to backprop gradient back - * this is a version supporting multiple images - * \tparam Device which device it lies - * \tparam dstdim destination dimension - */ - template - struct PackColToPatchXExp: public MakeTensorExp< PackColToPatchXExp, Tensor, dstdim>{ - /*! \brief source operand */ - const Tensor& mat_; - /*! \brief patch size */ - index_t psize_; - /*! \brief patch stride */ - index_t pstride_; - /*! \brief constructor */ - PackColToPatchXExp( const Tensor &mat, Shape imshape, index_t psize, index_t pstride ) - :mat_(mat), psize_(psize), pstride_(pstride){ - this->shape_ = imshape; - const index_t o_height = ( imshape[1] - psize ) / pstride + 1; - const index_t o_width = ( imshape[0] - psize ) / pstride + 1; - utils::Assert( mat.shape[0] == o_height * o_width * imshape.ProdShape(3,dstdim), "PackColToPatchExp: mat.shape[0] mismatch" ); - utils::Assert( mat.shape[1] == psize * psize * imshape[2], "PackColToPatchExp: mat.shape[1] mismatch" ); - } - }; - - /*! - * \brief reshape the content to another shape - * input: Tensor: ishape - * output: Tensor ishape.Size() == oshape.Size() - * \tparam SrcExp source expression - * \tparam dimdst target dimension - * \tparam dimsrc source dimension - */ - template - struct ReshapeExp: public MakeTensorExp< ReshapeExp, SrcExp, dimdst>{ - /*! \brief source expression */ - const SrcExp& src_; - /*! 
\brief smallest dimension of input */ - index_t ishape0_; - /*! \brief constructor */ - ReshapeExp( const SrcExp &src, Shape shape ):src_(src){ - Shape ishape = ShapeCheck::Check( src_ ); - utils::Assert( ishape.Size() == shape.Size(), "reshape size must match" ); - ishape0_ = ishape[0]; - this->shape_ = shape; - } - }; - - /*! - * \brief swap two axis of a tensor - * input: Tensor: ishape - * output: Tensor oshape[a1],oshape[a2] = ishape[a2],oshape[a1] - * - * \tparam SrcExp type of source expression - * \tparam dimsrc source dimension - * \tparam a1 smaller dimension to be swapped - * \tparam a2 larger dimension to be swapped - */ - template - struct SwapAxisExp: public MakeTensorExp< SwapAxisExp, SrcExp, dimsrc>{ - /*! \brief source expression */ - const SrcExp& src_; - /*! \brief constructor */ - SwapAxisExp( const SrcExp &src ):src_(src){ - this->shape_ = ShapeCheck::Check(src); - std::swap( this->shape_[a1], this->shape_[a2] ); - } - }; - - /*! - * \brief reduction to 1 dimension tensor - * input: Tensor: ishape - * output: Tensor shape[0] = ishape[dimkeep]; - * - * \tparam EType type of expression to be reduced - * \tparam Reducer which reducer to use - * \tparam srcdim dimension of source - * \tparam dimkeep which dimension to be kept, - */ - template - struct ReduceTo1DExp: public Exp< ReduceTo1DExp, type::kComplex >{ - /*! \brief source operand */ - const EType& src_; - /*! \brief source operand, scale of the */ - real_t scale_; - /*! \brief construct a repmat expression from src and nrow */ - ReduceTo1DExp( const EType& src, real_t scale ):src_(src),scale_(scale){} - }; - - /*! - * \brief pooling expression, do reduction over local patches of a image - * \tparam Reducer reduction method during pooling - * \tparam SrcExp source expression to be pooled from - * \tparam srcdim dimension of src - */ - template - struct PoolingExp: public MakeTensorExp< PoolingExp, SrcExp, srcdim> { - /*! \brief source operand */ - const SrcExp& src_; - /*! \brief kernel size */ - index_t ksize_; - /*! \brief kernel stride */ - index_t kstride_; - /*! \brief source height shape[1] */ - index_t src_height_; - /*! \brief source width shape[0] */ - index_t src_width_; - /*! \brief constructor */ - PoolingExp( const SrcExp &src, index_t ksize, index_t kstride ) - : src_(src), ksize_(ksize), kstride_(kstride) { - Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check( src_ ); - utils::Assert( sshape[0] >= ksize && sshape[1] >= ksize, "pool: kernel must be smaller than image" ); - this->src_height_ = sshape[1]; - this->src_width_ = sshape[0]; - this->shape_ = sshape; - this->shape_[1] = (src_height_ - ksize) / kstride + 1; - this->shape_[0] = (src_width_ - ksize) / kstride + 1; - } - /*! \brief constructor, specify shape */ - PoolingExp( const SrcExp &src, Shape<2> pshape, index_t ksize, index_t kstride ) - : src_(src), ksize_(ksize), kstride_(kstride) { - Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check( src_ ); - utils::Assert( sshape[0] >= ksize && sshape[1] >= ksize, "pool: kernel must be smaller than image" ); - this->src_height_ = sshape[1]; - this->src_width_ = sshape[0]; - this->shape_ = sshape; - this->shape_[1] = pshape[1]; - this->shape_[0] = pshape[0]; - } - }; - - /*! - * \brief unpooling expr reverse operation of pooling, used to pass gradient back - * \tparam Reducer specifies reduction operation during pooling - * \tparam Device which device it lies - */ - template - struct UnPoolingExp: public MakeTensorExp< UnPoolingExp, Tensor, 4> { - /*! 
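// Illustrative sketch of the output-shape rule used by PoolingExp above
// (valid-style pooling, no padding).  Pure arithmetic, no mshadow types:
inline int PooledSize(int src, int ksize, int kstride) {
  return (src - ksize) / kstride + 1;  // e.g. src=28, ksize=2, kstride=2 -> 14
}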
\brief source input, corresponds to src in pooling */ - const Tensor& data_src_; - /*! \brief result of pooled data, corresponds to result of pooling */ - const Tensor& data_pooled_; - /*! \brief gradient data of pooled part, to be propgate down */ - const Tensor& grad_pooled_; - /*! \brief kernel size */ - index_t ksize_; - /*! \brief kernel stride */ - index_t kstride_; - /*! \brief constructor */ - UnPoolingExp( const Tensor &data_src, const Tensor &data_pooled, - const Tensor &grad_pooled, index_t ksize, index_t kstride ) - : data_src_(data_src), data_pooled_(data_pooled), grad_pooled_(grad_pooled), - ksize_(ksize), kstride_(kstride) { - utils::Assert( grad_pooled.shape == data_pooled.shape, "UnPoolingExp: pooled shape mismatch" ); - utils::Assert( grad_pooled.shape[2] == data_src.shape[2], "UnPoolingExp: pool and src shape mismatch" ); - utils::Assert( grad_pooled.shape[3] == data_src.shape[3], "UnPoolingExp: pool and src shape mismatch" ); - this->shape_ = data_src_.shape; - } - }; - - /*! - * \brief padding expression, pad a image with zeros - * \tparam SrcExp source expression to be pooled from - * \tparam srcdim dimension of src - */ - template - struct PaddingExp : public MakeTensorExp, SrcExp, srcdim> { - /*! \brief source operand */ - const SrcExp& src_; - /*! \brief pad size */ - index_t pad_; - /*! \brief source tensor height */ - index_t src_height_; - /*! \brief source tensor width */ - index_t src_width_; - /*! \brief constructor */ - PaddingExp( const SrcExp &src, index_t pad ) - : src_(src), pad_(pad) { - this->shape_ = ShapeCheck::Check( src_ ); - src_height_ = this->shape_[1]; - src_width_ = this->shape_[0]; - this->shape_[1] += pad * 2; // height - this->shape_[0] += pad * 2; // width - } - }; - - /*! - * \brief crop expression, cut off the boundary region, reverse operation of padding - * \tparam SrcExp source expression to be pooled from - * \tparam srcdim dimension of src - */ - template - struct CroppingExp : public MakeTensorExp< CroppingExp, SrcExp, srcdim> { - /*! \brief source operand */ - const SrcExp& src_; - /*! \brief pad height */ - index_t pad_height_; - /*! \brief pad height */ - index_t pad_width_; - /*! \brief src height */ - index_t src_height_; - /*! \brief constructor */ - CroppingExp(const SrcExp &src, Shape<2> cshape ): src_(src) { - this->shape_ = ShapeCheck::Check( src_ ); - utils::Assert(this->shape_[1] >= cshape[1], "CroppingExp: height requirement not met"); - utils::Assert(this->shape_[0] >= cshape[0], "CroppingExp: width requirement not met"); - pad_height_ = (this->shape_[1] - cshape[1]) / 2; - pad_width_ = (this->shape_[0] - cshape[0]) / 2; - src_height_ = this->shape_[1]; - this->shape_[1] = cshape[1]; // width - this->shape_[0] = cshape[0]; // height - } - /*! \brief constructor */ - CroppingExp(const SrcExp &src, Shape<2> cshape, index_t start_height, index_t start_width ) - : src_(src), pad_height_(start_height), pad_width_(start_width) { - this->shape_ = ShapeCheck::Check( src_ ); - utils::Assert(this->shape_[1] >= cshape[1], "CroppingExp: height requirement not met"); - utils::Assert(this->shape_[0] >= cshape[0], "CroppingExp: width requirement not met"); - src_height_ = this->shape_[1]; - this->shape_[1] = cshape[1]; // width - this->shape_[0] = cshape[0]; // height - } - - }; // struct CroppingExp - - - /*! 
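// Illustrative sketch of the shape bookkeeping in PaddingExp / CroppingExp
// above: pad() grows height and width by 2*pad, and the default crop()
// removes a border so that the kept region is centered.  Pure arithmetic:
inline int PaddedSize(int src, int pad) { return src + 2 * pad; }
inline int CropOffset(int src, int dst) { return (src - dst) / 2; }
// e.g. a 24x24 map padded by 2 becomes 28x28; cropping it back to 24x24
// uses offset (28 - 24) / 2 = 2 on each axis.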
- * \brief mirror expression, mirror a image in width - * \tparam SrcExp source expression to be mirrored - * \tparam srcdim dimension of src - */ - template - struct MirroringExp : public MakeTensorExp, SrcExp, srcdim> { - /*! \brief source operand */ - const SrcExp& src_; - /*! \brief constructor */ - MirroringExp( const SrcExp &src ): src_(src) { - this->shape_ = ShapeCheck::Check( src_ ); - } - }; - - /*! - * \brief channel pooling expression, do reduction over (local nearby) channels, used to implement local response normalization - * \tparam Reducer reduction method during pooling - * \tparam SrcExp source expression to be pooled from - * \tparam srcdim dimension of src - */ - template - struct ChannelPoolingExp: public MakeTensorExp< ChannelPoolingExp, SrcExp, srcdim> { - /*! \brief source operand */ - const SrcExp& src_; - /*! \brief neighbor size */ - index_t nsize_; - /*! \brief constructor */ - ChannelPoolingExp( const SrcExp &src, index_t nsize ): src_(src), nsize_(nsize){ - utils::Assert( nsize % 2 == 1, "ChannelPoolingExp: local size must be odd, to make it symmetric" ); - this->shape_ = ShapeCheck::Check( src_ ); - utils::Assert( this->shape_[2] >= nsize_, "ChannelPoolingExp: local size need to be smaller than number of channels" ); - } - }; - }; // namespace expr - - - // Declaration of all functions go here - namespace expr{ - /*! \brief operator overload */ - template - inline ReduceTo1DExp operator*( const ReduceTo1DExp &e, real_t scale ){ - return ReduceTo1DExp( e.src_, e.scale_*scale ); - } - /*! \brief operator overload */ - template - inline ReduceTo1DExp operator*( real_t scale, const ReduceTo1DExp &e ){ - return ReduceTo1DExp( e.src_, e.scale_*scale ); - } - - /*! - * \brief a expression that replicate a 1 dimension tensor in dimension dimcast - * \param src Tensor: shape[0] - * \param shape shape of output - * \return a expresion with type Tensor - * \tparam dimcast target dimension where the 1D tensor will be broadcasted - * \tparam Device which device it lies - * \tparam dimdst dimension of destination tensor - */ - template - inline Broadcast1DExp broadcast( const Tensor &src, Shape shape ){ - TypeCheckPass< dimcast::Error_Expression_Does_Not_Meet_Dimension_Req(); - utils::Assert( src.shape[0] == shape[dimcast], "broadcast, shape mismatch" ); - return Broadcast1DExp( src, shape ); - } - - /*! 
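// Illustrative sketch of the channel window reduced by ChannelPoolingExp
// above (nsize must be odd, hnsize = nsize / 2), as used for local response
// normalization; it mirrors the clamping done in the evaluation plan later
// in this file.  Pure arithmetic only:
inline void ChannelWindow(int c, int channels, int nsize,
                          int *cstart, int *cend) {
  const int hnsize = nsize / 2;
  *cstart = c < hnsize ? 0 : c - hnsize;
  *cend = c + hnsize + 1 < channels ? c + hnsize + 1 : channels;
  // e.g. c=0, channels=16, nsize=5 -> window [0, 3); c=7 -> window [5, 10)
}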
- * \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution - * after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations: - * - * weight; shape[1]: out_channel, shape[0]: ichannel*psize*psize - * output; shape[1]: out_channel, shape[0]: out_height*out_width * num_of_images - * out_height = ( in_height - psize ) / pstride + 1, this means we pad inperfect patch with 0 - * out_width = ( in_width - psize ) / pstride + 1 - * - * \return mat target matrix; shape[1]: in_channel*psize*psize shape[0]: out_height*out_width * num_of_images - * \param img source image; shape[2]: in_channels, shape[1]: in_height, shape[0]: in_width, can be 3D or 4D tensor(multiple images) - * \param psize height and width of each patch - * \param pstride stride of each patch - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline UnpackPatchToColXExp::kDim > unpack_patch2col( const Exp &img, index_t psize, index_t pstride ){ - TypeCheckPass< ExpInfo::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return UnpackPatchToColXExp::kDim >( img.self(), psize, pstride ); - } - - /*! - * \brief reverse operation of pack_col2patch, can be used to implement deconvolution - * \return packed img expression - * \param mat source matrix - * \param imshape shape of target img - * \param psize height and width of each patch - * \param pstride stride of each patch - * \tparam Device the Device where input data lies - */ - template - inline PackColToPatchXExp pack_col2patch( const Tensor &mat, Shape imshape, index_t psize, index_t pstride ){ - utils::Assert( imshape[0] >= psize && imshape[1] >= psize, "PackColToPatch:image shape smaller than patch size"); - return PackColToPatchXExp( mat, imshape, psize, pstride ); - } - /*! - * \brief a expression that reshapes a tensor to another shape - * \param src Tensor: - * \param oshape target shape - * \return a expresion with type Tensor - * \tparam SrcExp source expression - * \tparam etype source expression type - * \tparam dimdst target dimension - */ - template - inline ReshapeExp< SrcExp,dimdst, ExpInfo::kDim > reshape( const Exp &src, Shape oshape ){ - return ReshapeExp< SrcExp,dimdst, ExpInfo::kDim >( src.self(), oshape ); - } - - /*! - * \brief a expression that reshapes a tensor to another shape - * \param src Tensor: - * \return a expresion with type Tensor - * \tparam a1 smaller dimension to be swapped - * \tparam a2 larger dimension to be swapped - * \tparam SrcExp source expression - * \tparam etype source expression type - */ - template - inline SwapAxisExp< SrcExp, ExpInfo::kDim, a1,a2> swapaxis( const Exp &src ){ - typedef ExpInfo Info; - TypeCheckPass< Info::kDim>=a1+1 && Info::kDim >= a2+1 && a1+1 <= a2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return SwapAxisExp< SrcExp,Info::kDim,a1,a2>( src.self() ); - } - - /*! - * \brief a sum over all dimensions, except dimkeep - * \param exp input expression that must be a matrix Tensor - * \return a expresion with type Tensor - * \tparam dimkeep the dimension that will be kept - * \tparam SrcExp expression - * \tparam etype type of expression - */ - template - inline ReduceTo1DExp sumall_except_dim( const Exp &exp ){ - return ReduceTo1DExp( exp.self(), 1.0f ); - } - - /*! 
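// Illustrative sketch of the shape relations documented for unpack_patch2col
// above, i.e. convolution written as output = dot(weight, mat):
inline int ConvOutSize(int in_size, int psize, int pstride) {
  return (in_size - psize) / pstride + 1;
}
// Concrete example: in_c=3, in_h=in_w=32, psize=5, pstride=1, num_img=1
//   -> out_h = out_w = ConvOutSize(32, 5, 1) = 28,
//      mat    is (3*5*5) x (28*28*1) = 75 x 784   (shape[1] x shape[0]),
//      weight is out_channel x 75, so dot(weight, mat) is out_channel x 784.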
- * \brief pooling subregion results together - * \param src source image, shape[3]: batch, shape[2]: channel shape[1]: height shape[0]:width - * \param ksize kernel size - * \param kstride stride for each kernel - * \return expression of pooled result - * \tparam Reducer reducer type - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline PoolingExp::kDim > pool( const Exp &src, index_t ksize, index_t kstride ) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return PoolingExp::kDim >(src.self(), ksize, kstride); - } - /*! - * \brief same as pool, except the output shape is specified by pshape - * \param src source image - * \param pshape ouput shape - * \param ksize kernel size - * \param kstride stride for each kernel - * \return expression of pooled result - * \tparam Reducer reducer type - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline PoolingExp::kDim > pool( const Exp &src, Shape<2> pshape, index_t ksize, index_t kstride ) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return PoolingExp::kDim >(src.self(), pshape, ksize, kstride); - } - /*! - * \brief unpooling gradient for 4D, backprop gradient value back, revserse operation of pooling - * \param data_src source input, corresponds to src in pooling - * \param data_pooled result of pooled data, corresponds to result of pooling - * \param grad_pooled gradient data of pooled part, to be propgate down - * \param ksize kernel size - * \param kstride stride for each kernel - * \return expression corresponding to unpooled 4D Tensor, storing backproped gradient - * \tparam Reducer reducer type - * \tparam Device device where data lies - */ - template - inline UnPoolingExp unpool( const Tensor&data_src, const Tensor &data_pooled, - const Tensor &grad_pooled, index_t ksize, index_t kstride ) { - return UnPoolingExp(data_src, data_pooled, grad_pooled,ksize, kstride); - } - - /*! - * \brief padding expression, pad a image with zeros on boundaries, padding affects shape[0], and shape[1] - * \param src original image batches - * \param pad padding size - * \return expression corresponding to padded result - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline PaddingExp::kDim> pad(const Exp &src, index_t pad) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return PaddingExp::kDim>(src.self(), pad); - } - - /*! - * \brief revserse operationg of padding, cut off boundaries, crop output from center of input - * \param src original image batches - * \param oshape output shape to be cropped - * \return expression corresponding to padded result - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline CroppingExp::kDim> crop( const Exp &src, Shape<2> oshape ) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return CroppingExp::kDim>(src.self(), oshape); - } - /*! 
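// Illustrative sketch of how unpool (declared above) routes gradients back:
// a source index y lies inside every pooling window [py*kstride,
// py*kstride + ksize), and the index range of those windows (mirroring the
// UnPoolingExp evaluation plan later in this file) is:
inline void UnpoolWindow(int y, int ksize, int kstride, int pooled_size,
                         int *py_min, int *py_max) {
  *py_min = y < ksize ? 0 : (y - ksize + kstride) / kstride;
  *py_max = (y + kstride) / kstride < pooled_size
                ? (y + kstride) / kstride : pooled_size;
  // e.g. ksize=3, kstride=2, y=4 -> py in [1, 3): the windows starting at
  // source index 2 and 4 both contain index 4.
}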
- * \brief same as crop, but can specify starting position to do cropping - * \param src original image batches - * \param oshape output shape to be cropped - * \param start_height start height position to do cropping - * \param start_width start width position to do cropping - * \return expression corresponding to padded result - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline CroppingExp::kDim> crop( const Exp &src, Shape<2> oshape, index_t start_height, index_t start_width ) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return CroppingExp::kDim>(src.self(), oshape, start_height, start_width); - } - - /*! - * \brief mirroring expression, mirror images in width - * \param src original image batches - * \return expression corresponding to mirrored result - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline MirroringExp::kDim> mirror(const Exp &src) { - TypeCheckPass< ExpInfo::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return MirroringExp::kDim>(src.self()); - } - - /*! - * \brief channel pooling, do reduction over (local nearby) channels, used to implement local response normalization - * \param src source data - * \param nsize neighbor size - * \return expression of pooled result - * \tparam Reducer reducer type - * \tparam SrcExp source expression - * \tparam etype type of expression - */ - template - inline ChannelPoolingExp::kDim > chpool( const Exp &src, index_t nsize ) { - TypeCheckPass< ExpInfo::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - return ChannelPoolingExp::kDim >(src.self(),nsize); - } - // short cut functions - /*! - * \brief a expression that replicate a 1 dimension tensor for nrow times - * \param src Tensor: shape[0] - * \param nrow number of rows to replicate - * \return a expresion with type Tensor shape[0], shape[1] = nrow - * \tparam Device which device it lies - */ - template - inline Broadcast1DExp repmat( const Tensor &src, index_t nrow ){ - return broadcast<0>( src, Shape2( nrow, src.shape[0] ) ); - } - /*! - * \brief a expression that sum over rows of a matrix - * \param exp input expression that must be a matrix Tensor - * \return a expresion with type Tensor - * \tparam SrcExp expression - * \tparam etype type of expression - */ - template - inline ReduceTo1DExp sum_rows( const Exp &exp ){ - return sumall_except_dim<0>( exp ); - } - - }; // namespace expr -}; // namespace mshadow - -// ================================================== -// implementations afterwards, -// no need to read if only use the functions -// -------------------------------------------------- -namespace mshadow{ - namespace expr{ - template - struct ExpComplexEngine< SV, Device, 1, ReduceTo1DExp >{ - inline static void Eval( Tensor &dst, const ReduceTo1DExp &exp ){ - TypeCheckPass< dimkeep!=0 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - MapReduceKeepHighDim( dst, exp.src_, exp.scale_ ); - } - }; - - template - struct ExpComplexEngine< SV, Device, 1, ReduceTo1DExp >{ - inline static void Eval( Tensor &dst, const ReduceTo1DExp &exp ){ - MapReduceKeepLowest( dst, exp.src_, exp.scale_ ); - } - }; - }; // namespace expr - - namespace expr{ - /*! 
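// Illustrative sketch (mshadow-1.x style, real_t = float assumed; the
// Tensor<Device, dim> template arguments shown here were stripped from the
// listing above) of the repmat / sum_rows shortcuts, as typically used for
// the bias of a fully connected layer:
inline void BiasForwardBackward(Tensor<cpu, 2> out, Tensor<cpu, 1> bias,
                                Tensor<cpu, 2> gout, Tensor<cpu, 1> gbias) {
  out += repmat(bias, out.shape[1]);  // broadcast bias to every row of out
  gbias = sum_rows(gout);             // bias gradient: reduce over the rows
}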
\brief execution plan of Broadcast1DExp */ - template - struct Plan< Broadcast1DExp >{ - public: - Plan( const Broadcast1DExp &e ) - : dptr_( e.src_.dptr ), - ystride_( e.shape_.ProdShape(1,dimcast) ), - length_(e.shape_[dimcast]){ - TypeCheckPass< dimcast!=0 >::Error_Expression_Does_Not_Meet_Dimension_Req(); - } - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ (y / ystride_) % length_ ]; - } - private: - const real_t *dptr_; - const index_t ystride_, length_; - }; - - /*! \brief execution plan of Broadcast1DExp */ - template - struct Plan< Broadcast1DExp >{ - public: - Plan( const Broadcast1DExp &e ): dptr_( e.src_.dptr ){} - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ x ]; - } - private: - const real_t *dptr_; - }; - }; // namespace expr - - namespace expr{ - template - struct Plan< UnpackPatchToColXExp >{ - public: - Plan( const UnpackPatchToColXExp &e ) - :src_(MakePlan(e.img_)),psize_(e.psize_), pstride_(e.pstride_), - i_channel_(e.i_channel_), i_height_(e.i_height_), i_width_(e.i_width_), - o_height_(( i_height_ - psize_ ) / pstride_ + 1), - o_width_ (( i_width_ - psize_ ) / pstride_ + 1){ - } - MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ - const index_t x_offset = i % psize_; - const index_t idivp = i / psize_; - const index_t y_offset = idivp % psize_; - const index_t c = idivp / psize_; - const index_t x = (j % o_width_) * pstride_ + x_offset; - const index_t jdivw = j / o_width_; - const index_t y = (jdivw % o_height_) * pstride_ + y_offset; - const index_t n = jdivw / o_height_; - - if( x < i_width_ && y < i_height_ ){ - return src_.Eval( ( n * i_channel_ + c ) * i_height_ + y, x ); - }else{ - return 0.0f; - } - } - private: - Plan src_; - const index_t psize_, pstride_, i_channel_, i_height_, i_width_, o_height_, o_width_; - }; - - template - struct Plan< PackColToPatchXExp >{ - public: - Plan( const PackColToPatchXExp &e ) - :mat_(e.mat_), psize_(e.psize_), pstride_(e.pstride_), - i_channel_(e.shape_[2]), i_height_(e.shape_[1]), - o_width_(( e.shape_[0] - psize_ ) / pstride_ + 1), - o_height_(( e.shape_[1] - psize_ ) / pstride_ + 1){ - // note: i/o convention are same as unpack - } - MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ - using namespace std; - const index_t y = i % i_height_; - const index_t idivh = i / i_height_; - const index_t c = idivh % i_channel_; - const index_t n = idivh / i_channel_; - const index_t x = j; - const index_t py_min = y < psize_ ? 0 : (y-psize_+pstride_)/pstride_; - const index_t px_min = x < psize_ ? 
0 : (x-psize_+pstride_)/pstride_; - const index_t py_max = min( (y+pstride_)/pstride_, o_height_); - const index_t px_max = min( (x+pstride_)/pstride_, o_width_ ); - real_t res = 0.0f; - for( index_t py = py_min; py < py_max; ++py ){ - for( index_t px = px_min; px < px_max; ++px ){ - res += mat_[ (c * psize_ + y - py*pstride_) * psize_ + x - px*pstride_ ][ (n * o_height_ + py) * o_width_+px ]; - } - } - return res; - } - private: - Tensor mat_; - const index_t psize_, pstride_, i_channel_, i_height_, o_width_, o_height_; - }; - }; - - namespace expr{ - template - struct Plan< ReshapeExp >{ - public: - Plan( const ReshapeExp &e ) - : src_(MakePlan(e.src_)), oshape0_(e.shape_[0]), ishape0_(e.ishape0_){ - } - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - const index_t idx = y * oshape0_ + x; - return src_.Eval( idx / ishape0_, idx % ishape0_ ); - } - private: - Plan src_; - const index_t oshape0_, ishape0_; - }; - // special work plan for 1 dimensional data - template - struct Plan< ReshapeExp >{ - public: - Plan( const ReshapeExp &e ) - : src_(MakePlan(e.src_)), oshape0_(e.shape_[0]){ - } - MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ - return src_.Eval( 0, y * oshape0_ + x ); - } - private: - Plan src_; - const index_t oshape0_; - }; - }; - - namespace expr{ - template - struct Plan< SwapAxisExp >{ - public: - Plan( const SwapAxisExp &e ) - : src_(MakePlan(e.src_)), - shape1_( e.shape_.ProdShape( 1, a1 ) ), - shape2_( e.shape_[a1] ), - shape3_( e.shape_.ProdShape( a1+1, a2 ) ), - shape4_( e.shape_[a2] ){ - } - MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ - const index_t y = i % shape1_; - i /= shape1_; - const index_t z = i % shape2_; - i /= shape2_; - const index_t c = i % shape3_; - i /= shape3_; - const index_t n = i % shape4_; - // swap z and n - return src_.Eval( ((((i/shape4_)*shape2_ + z) * shape3_+c) * shape4_ + n ) * shape1_ + y, j ); - } - private: - Plan src_; - const index_t shape1_, shape2_, shape3_, shape4_; - }; - - template - struct Plan< SwapAxisExp >{ - public: - Plan( const SwapAxisExp &e ) - : src_(MakePlan(e.src_)), - shape0_( e.shape_[0] ), - shape1_( e.shape_.ProdShape(1,a2) ), - shape2_( e.shape_[a2] ){ - } - MSHADOW_XINLINE real_t Eval( index_t i, index_t x ) const{ - // swap x and z - const index_t y = i % shape1_; - i /= shape1_; - const index_t z = i % shape2_; - const index_t n = i / shape2_; - return src_.Eval( ( n*shape0_ + x ) * shape1_ + y , z ); - } - private: - Plan src_; - const index_t shape0_, shape1_, shape2_; - }; - }; - - namespace expr{ - template - struct Plan< PoolingExp< Reducer, SrcExp, srcdim> > { - public: - Plan( const PoolingExp &e ) - : src_( MakePlan( e.src_ ) ), ksize_(e.ksize_), kstride_(e.kstride_), - src_height_(e.src_height_),src_width_(e.src_width_), new_height_(e.shape_[1]) { - } - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - using namespace std; - const index_t py = i % new_height_; - const index_t y_start = py * kstride_; - const index_t y_end = min( y_start + ksize_, src_height_ ); - const index_t px = j; - const index_t x_start = px * kstride_; - const index_t x_end = min( x_start + ksize_, src_width_ ); - const index_t c = i / new_height_; - - real_t res = Reducer::kInitV; - for (index_t y = y_start; y < y_end; ++y) { - for (index_t x = x_start; x < x_end; ++x) { - Reducer::Reduce( res, src_.Eval( c*src_height_+y, x ) ); - } - } - return res; - } - private: - Plan src_; - const index_t ksize_, kstride_; - const index_t src_height_, src_width_; - const index_t 
new_height_; - }; - - template - struct Plan > { - public: - Plan(const UnPoolingExp &e) - : data_src_(e.data_src_), data_pooled_(e.data_pooled_), grad_pooled_(e.grad_pooled_), - ksize_(e.ksize_), kstride_(e.kstride_) {} - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - using namespace std; - const index_t x = j; - const index_t y = i % data_src_.shape[1]; - const index_t c = i / data_src_.shape[1]; - const real_t vsrc = data_src_[0][c][y][x]; - - const index_t py_min = y < ksize_ ? 0 : (y-ksize_+kstride_)/kstride_; - const index_t px_min = x < ksize_ ? 0 : (x-ksize_+kstride_)/kstride_; - const index_t py_max = min( (y+kstride_)/kstride_, data_pooled_.shape[1]); - const index_t px_max = min( (x+kstride_)/kstride_, data_pooled_.shape[0]); - - real_t val = 0; - for( index_t py = py_min; py < py_max; ++py ){ - for( index_t px = px_min; px < px_max; ++px ){ - val += Reducer::PartialGrad(vsrc, data_pooled_[0][c][py][px]) * grad_pooled_[0][c][py][px]; - } - } - return val; - } - private: - Tensor data_src_, data_pooled_, grad_pooled_; - const index_t ksize_; - const index_t kstride_; - }; - }; // namespace expr - - namespace expr{ - template - struct Plan< PaddingExp > { - public: - Plan(const PaddingExp &e) - : src_(MakePlan(e.src_)), pad_(e.pad_), new_height_(e.shape_[1]), - src_height_(e.src_height_), src_width_(e.src_width_) {} - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - const index_t x = j; - const index_t y = i % new_height_; - const index_t c = i / new_height_; - if (y < pad_ || x < pad_) return 0.0f; - const index_t h = y - pad_; - const index_t w = x - pad_; - if (h < src_height_ && w < src_width_) { - return src_.Eval(c * src_height_ + h, w); - } else { - return 0.0f; - } - } - private: - Plan src_; - const index_t pad_; - const index_t new_height_; - const index_t src_height_; - const index_t src_width_; - }; - - template - struct Plan > { - public: - Plan(const CroppingExp &e) - : src_(MakePlan(e.src_)), pad_height_(e.pad_height_),pad_width_(e.pad_width_), - new_height_(e.shape_[1]), src_height_(e.src_height_) {} - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - const index_t x = j; - const index_t y = i % new_height_; - const index_t c = i / new_height_; - const index_t h = y + pad_height_; - const index_t w = x + pad_width_; - return src_.Eval(c * src_height_ + h, w); - } - private: - Plan src_; - const index_t pad_height_, pad_width_; - const index_t new_height_; - const index_t src_height_; - }; - - template - struct Plan< MirroringExp > { - public: - Plan(const MirroringExp &e) - : src_(MakePlan(e.src_)), width_(e.shape_[0]){} - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - return src_.Eval( i, width_ - j - 1 ); - } - private: - Plan src_; - const index_t width_; - }; - }; // namespace expr - - namespace expr{ - template - struct Plan< ChannelPoolingExp< Reducer, SrcExp, srcdim> > { - public: - Plan( const ChannelPoolingExp &e ) - : src_( MakePlan( e.src_ ) ), channel_(e.shape_[2]), - height_(e.shape_[1]),width_(e.shape_[0]), hnsize_(e.nsize_/2){ - } - MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { - using namespace std; - const index_t y = i % height_; - i /= height_; - const index_t c = i % channel_; - const index_t n = i / channel_; - const index_t x = j; - const index_t cstart = c < hnsize_ ? 
0 : c - hnsize_; - const index_t cend = min( c + hnsize_ + 1, channel_ ); - real_t res = Reducer::kInitV; - for( index_t cc = cstart; cc < cend; ++ cc ){ - Reducer::Reduce( res, src_.Eval( (n*channel_+cc)*height_ + y, x ) ); - } - return res; - } - private: - Plan src_; - const index_t channel_, height_, width_, hnsize_; - }; - }; -}; // namespace mshadow - -#if MSHADOW_USE_SSE -// implementations of SSE support, if possible -#include "tensor_sse-inl.hpp" -namespace mshadow{ - namespace expr{ - template - struct SSECheck< Broadcast1DExp >{ - const static bool kPass = true; - }; - template - struct SSEAlignCheck<2, Broadcast1DExp >{ - inline static bool Check( const Broadcast1DExp &exp ){ - return sse2::CheckAlign( exp.src_.dptr ); - } - }; - template - class SSEPlan< Broadcast1DExp >{ - public: - SSEPlan( const Broadcast1DExp &t ) - :dptr_(t.src_.dptr){} - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const{ - return sse2::FVec( &dptr_[ x ] ); - } - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ x ]; - } - private: - const real_t *dptr_; - }; - }; -}; -#endif - -#endif - diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h new file mode 100644 index 000000000000..ffd203d33a1a --- /dev/null +++ b/mshadow/tensor_gpu-inl.h @@ -0,0 +1,174 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file tensor_gpu-inl.h + * \brief implementation of GPU host code + * \author Bing Xu, Tianqi Chen + */ +#ifndef MSHADOW_TENSOR_GPU_INL_H_ +#define MSHADOW_TENSOR_GPU_INL_H_ +#include "./base.h" +#include "./tensor.h" + +namespace mshadow { +#if MSHADOW_USE_CUDA +template<> +inline void InitTensorEngine(int dev_id) { + cudaDeviceProp prop; + int device_id = 0; + int device_count = 0; + cudaGetDeviceCount(&device_count); + utils::Check(device_count > 0, + "Cannot find CUDA device. 
Please check CUDA-Configuration"); + if (dev_id < 0) { + device_id = 0; + } else { + device_id = dev_id; + } + utils::Check(device_id < device_count, "Incorrect Device ID"); + utils::Check(cudaSetDevice(device_id) == cudaSuccess, "cannot set device"); + cudaGetDeviceProperties(&prop, device_id); + printf("Use CUDA Device %d: %s\n", device_id, prop.name); + cublasInit(); +} +template<> +inline void ShutdownTensorEngine(void) { + cublasShutdown(); +} +template<> +inline void SetDevice(int devid) { + utils::Check(cudaSetDevice(devid) == cudaSuccess, "cannot set device"); +} +template +inline void AllocSpace(Tensor *obj, bool pad) { + size_t pitch; + // common choice for cuda mem align unit is 32 + if (pad && obj->size(dim - 1) >= MSHADOW_MIN_PAD_RATIO * 32) { + cudaError_t err = + cudaMallocPitch(reinterpret_cast(&(obj->dptr_)), &pitch, + obj->size(dim - 1) * sizeof(DType), + obj->shape_.FlatTo2D()[0]); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + obj->stride_ = static_cast(pitch / sizeof(DType)); + } else { + obj->stride_ = obj->size(dim - 1); + cudaError_t err = + cudaMallocPitch(reinterpret_cast(&(obj->dptr_)), &pitch, + obj->shape_.Size() * sizeof(DType), 1); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + } +} +template +inline void FreeSpace(Tensor *obj) { + cudaFree(obj->dptr_); obj->dptr_ = NULL; +} +template +inline void Copy(Tensor _dst, + Tensor _src, + cudaMemcpyKind kind, + Stream *stream) { + utils::Check(_dst.shape_ == _src.shape_, "Copy:shape mismatch"); + Tensor dst = _dst.FlatTo2D(); + Tensor src = _src.FlatTo2D(); + cudaError_t err = cudaMemcpy2DAsync(dst.dptr_, dst.stride_ * sizeof(DType), + src.dptr_, src.stride_ * sizeof(DType), + dst.size(1) * sizeof(DType), + dst.size(0), kind, + Stream::GetStream(stream)); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + // use synchronize call behavior for zero stream + if (stream == NULL) { + err = cudaStreamSynchronize(0); + utils::Check(err == cudaSuccess, cudaGetErrorString(err)); + } +} +template +inline void Copy(Tensor dst, + const Tensor &src, + Stream *stream) { + Copy(dst, src, cudaMemcpyDeviceToHost, stream); +} +template +inline void Copy(Tensor dst, + const Tensor &src, + Stream *stream) { + Copy(dst, src, cudaMemcpyDeviceToDevice, stream); +} +template +inline void Copy(Tensor dst, + const Tensor &src, + Stream *stream) { + Copy(dst, src, cudaMemcpyHostToDevice, stream); +} +#endif // MSHADOW_USE_CUDA +} // namespace mshadow + +// the following part is included only if compiler is nvcc +#ifdef __CUDACC__ +#include "./cuda/tensor_gpu-inl.cuh" + +namespace mshadow { +template +inline void MapExp(TRValue *dst, + const expr::Exp &exp) { + expr::TypeCheckPass::kMapPass> + ::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); + Shape eshape = expr::ShapeCheck::Check(exp.self()); + Shape dshape = expr::ShapeCheck::Check(dst->self()); + utils::Check(eshape[0] == 0 || eshape == dshape, + "Assignment: Shape of Tensors are not consistent with target"); + cuda::MapPlan(MakePlan(dst->self()), + MakePlan(exp.self()), + dshape.FlatTo2D(), + Stream::GetStream(expr::StreamInfo::Get(dst->self()))); +} + +template +inline void MapReduceKeepLowest(TRValue *dst, + const expr::Exp &exp, + DType scale) { + expr::TypeCheckPass::kRedPass> + ::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + Shape<2> eshape = expr::ShapeCheck::kDim, E> + ::Check(exp.self()).FlatTo2D(); + Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self()); + utils::Check(eshape[1] == dshape[0], + "MapReduceKeepLowest::reduction 
dimension do not match"); + utils::Check(eshape[0] != 0, "can not reduce over empty tensor"); + cuda::MapReduceKeepLowest + (MakePlan(dst->self()), MakePlan(exp.self()), scale, eshape, + Stream::GetStream(expr::StreamInfo::Get(dst->self()))); +} + +template +inline void MapReduceKeepHighDim(TRValue *dst, + const expr::Exp &exp, + DType scale) { + expr::TypeCheckPass::kRedPass> + ::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + typedef Shape::kDim> EShape; + EShape eshape = expr::ShapeCheck::kDim, E> + ::Check(exp.self()); + Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self()); + utils::Check(eshape[dimkeep] == dshape[0], + "MapReduceKeepHighDim::reduction dimension do not match"); + // use equvalent form + Shape<4> pshape = Shape4(eshape.ProdShape(0, dimkeep), + eshape[dimkeep], + eshape.ProdShape(dimkeep + 1, EShape::kSubdim), + eshape[EShape::kSubdim]); + // call equavalent map red dim 2 + cuda::MapReduceKeepDim1 + (MakePlan(dst->self()), MakePlan(exp.self()), scale, pshape, + Stream::GetStream(expr::StreamInfo::Get(dst->self()))); +} +template +inline void Softmax(Tensor dst, + const Tensor& src) { + cuda::Softmax(dst, src); +} +} // namespace mshadow +#endif // __CUDACC__ +#endif // MSHADOW_TENSOR_GPU_INL_H_ diff --git a/mshadow/tensor_gpu-inl.hpp b/mshadow/tensor_gpu-inl.hpp deleted file mode 100644 index a2c1fc4a138f..000000000000 --- a/mshadow/tensor_gpu-inl.hpp +++ /dev/null @@ -1,148 +0,0 @@ -#ifndef MSHADOW_TENSOR_GPU_INL_HPP -#define MSHADOW_TENSOR_GPU_INL_HPP -/*! - * \file tensor_gpu-inl.hpp - * \brief implementation of GPU host code - * \author Bing Xu, Tianqi Chen - */ -#include "tensor.h" - -#if !(MSHADOW_USE_CUDA) -namespace mshadow { - // do nothing if no GPU operation is involved - inline void InitTensorEngine( int dev_id ){ - } - inline void ShutdownTensorEngine( void ){ - } -}; -#else -namespace mshadow { - #if (MSHADOW_USE_NVML) - inline int AutoSelectDevice(int device_count) { - // TODO nvml device id and cuda device id are not consistent - return 0; - } - #endif - inline void InitTensorEngine(int dev_id){ - cudaDeviceProp prop; - int device_id = 0; - int device_count = 0; - cudaGetDeviceCount(&device_count); - utils::Assert(device_count > 0, "Cannot find CUDA device. 
Please check CUDA-Configuration"); - if (dev_id < 0) { - #if (MSHADOW_USE_NVML) - device_id = AutoSelectDevice(device_count); - #endif - } else { - device_id = dev_id; - } - utils::Assert( device_id < device_count, "Incorrect Device ID" ); - utils::Assert( cudaSetDevice(device_id) == cudaSuccess, "cannot set device" ); - cudaGetDeviceProperties(&prop, device_id); - printf("Use CUDA Device %d: %s\n", device_id, prop.name); - cublasInit(); - } - inline void ShutdownTensorEngine( void ){ - cublasShutdown(); - } - - template - inline void AllocSpace(Tensor &obj, bool pad){ - size_t pitch; - // common choice for cuda mem align unit is 32 - if( pad && obj.shape[0] >= MSHADOW_MIN_PAD_RATIO * 32 ){ - cudaError_t err = cudaMallocPitch( (void**)&obj.dptr, &pitch, \ - obj.shape[0] * sizeof(real_t), obj.FlatTo2D().shape[1] ); - utils::Assert( err == cudaSuccess, cudaGetErrorString(err) ); - obj.shape.stride_ = static_cast( pitch / sizeof(real_t) ); - }else{ - obj.shape.stride_ = obj.shape[0]; - cudaError_t err = cudaMallocPitch( (void**)&obj.dptr, &pitch, \ - obj.shape.Size() * sizeof(real_t), 1 ); - utils::Assert( err == cudaSuccess, cudaGetErrorString(err) ); - } - } - - template - inline void FreeSpace(Tensor &obj){ - cudaFree( obj.dptr ); obj.dptr = NULL; - } - - template - inline void Copy(Tensor _dst, Tensor _src, cudaMemcpyKind kind){ - utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" ); - Tensor dst = _dst.FlatTo2D(); - Tensor src = _src.FlatTo2D(); - cudaError_t err = cudaMemcpy2D( dst.dptr, dst.shape.stride_ * sizeof(real_t), - src.dptr, src.shape.stride_ * sizeof(real_t), - dst.shape[0] * sizeof(real_t), - dst.shape[1], kind ); - utils::Assert( err == cudaSuccess, cudaGetErrorString(err) ); - } - template - inline void Copy(Tensor dst, const Tensor &src){ - Copy( dst, src, cudaMemcpyDeviceToHost ); - } - template - inline void Copy(Tensor dst, const Tensor &src){ - Copy( dst, src, cudaMemcpyDeviceToDevice ); - } - template - inline void Copy(Tensor dst, const Tensor &src){ - Copy( dst, src, cudaMemcpyHostToDevice ); - } -}; - -#ifdef __CUDACC__ -// the following part is included only if compiler is nvcc -#include "cuda/tensor_gpu-inl.cuh" - -namespace mshadow{ - template - inline void MapPlan(Tensor _dst, const expr::Plan &plan){ - cuda::MapPlan( _dst.FlatTo2D(), plan ); - } - - template - inline void MapExp(Tensor dst, const expr::Exp &exp ){ - using namespace expr; - TypeCheckPass< TypeCheck::kMapPass >::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); - Shape eshape = ShapeCheck::Check( exp.self() ); - utils::Assert( eshape[0] == 0 || eshape == dst.shape, "Assignment: Shape of Tensors in expression is not consistent with target" ); - MapPlan( dst, MakePlan( exp.self() ) ); - } - - template - inline void MapReduceKeepLowest( Tensor dst, const expr::Exp &exp, real_t scale ){ - using namespace expr; - TypeCheckPass< TypeCheck::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); - Shape<2> eshape = ShapeCheck< ExpInfo::kDim, E >::Check( exp.self() ).FlatTo2D(); - - utils::Assert( eshape[0] == dst.shape[0], "reduction dimension do not match" ); - utils::Assert( eshape[1] != 0, "can not reduce over empty tensor" ); - cuda::MapReduceKeepLowest( dst, MakePlan( exp.self() ), scale, eshape ); - } - - template - inline void MapReduceKeepHighDim( Tensor dst, const expr::Exp &exp, real_t scale ){ - using namespace expr; - TypeCheckPass< TypeCheck::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); - typedef Shape< ExpInfo::kDim > EShape; - EShape eshape = ShapeCheck< ExpInfo::kDim, 
E >::Check( exp.self() ); - utils::Assert( eshape[dimkeep] == dst.shape[0], "reduction dimension do not match" ); - // use equvalent form - Shape<4> pshape = Shape4( eshape.ProdShape(dimkeep+1,EShape::kMaxShape), eshape[dimkeep], - eshape.ProdShape(1,dimkeep), eshape[0] ); - // call equavalent map red dim 2 - cuda::MapReduceKeepDim2( dst, MakePlan( exp.self() ), scale, pshape ); - } - - inline void Softmax( Tensor dst, const Tensor& src ){ - cuda::Softmax( dst, src ); - } -}; // namespace mshadow - -#endif // __CUDACC__ - -#endif // MSHADOW_USE_CUDA -#endif // TENSOR_GPU_INL_HPP diff --git a/mshadow/tensor_io.h b/mshadow/tensor_io.h deleted file mode 100644 index 2ce28b3a75e6..000000000000 --- a/mshadow/tensor_io.h +++ /dev/null @@ -1,137 +0,0 @@ -#ifndef MSHADOW_TENSOR_IO_H -#define MSHADOW_TENSOR_IO_H -/*! - * \file tensor_io.h - * \brief definitions of I/O functions for mshadow tensor - * \author Tianqi Chen - */ -#include -#include "tensor.h" - -namespace mshadow{ - namespace utils{ - /*! - * \brief interface of stream I/O, used to serialize data, - * it is not restricted to only this interface in SaveBinary/LoadBinary - * mshadow accept all class that implements Read and Write - */ - class IStream{ - public: - /*! - * \brief read data from stream - * \param ptr pointer to memory buffer - * \param size size of block - * \return usually is the size of data readed - */ - virtual size_t Read( void *ptr, size_t size ) = 0; - /*! - * \brief write data to stream - * \param ptr pointer to memory buffer - * \param size size of block - */ - virtual void Write( const void *ptr, size_t size ) = 0; - /*! \brief virtual destructor */ - virtual ~IStream( void ){} - }; - }; - - /*! - * \brief CPU/GPU: save a tensor by binary format, for GPU version, a temp Tensor storage will be allocated - * \param fo output binary stream - * \param src source data file - * \tparam dim dimension of tensor - * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. - */ - template - inline void SaveBinary( TStream &fo, const Tensor &src ); - /*! \brief refer to comment of cpu ver \sa SaveBinary */ - template - inline void SaveBinary( TStream &fo, const Tensor &src ); - - /*! - * \brief CPU/GPU: load a tensor by binary format, for GPU version, a temp Tensor storage will be allocated - * if pre_alloc is true , then space in dst is preallocated, and must have same shape of the tensor loaded - * if pre_alloc is false, then dst originally does not have space allocated, LoadBinary will allocate space for dst - * \param fi output binary stream - * \param dst destination file - * \param pre_alloc whether space is pre-allocated, if false, space allocation will happen - * \tparam dim dimension of tensor - * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. - */ - template - inline void LoadBinary( TStream &fi, Tensor &dst, bool pre_alloc ); - /*! \brief refer to comment of cpu ver \sa LoadBinary */ - template - inline void LoadBinary( TStream &fi, Tensor &dst, bool pre_alloc ); - - namespace utils{ - /*! \brief implementation of file i/o stream */ - class FileStream: public IStream{ - public: - /*! \brief constructor */ - FileStream( FILE *fp ):fp_(fp){} - virtual size_t Read( void *ptr, size_t size ){ - return fread( ptr, size, 1, fp_ ); - } - virtual void Write( const void *ptr, size_t size ){ - fwrite( ptr, size, 1, fp_ ); - } - /*! 
\brief close file */ - inline void Close( void ){ - fclose( fp_ ); - } - private: - FILE *fp_; - }; - }; -}; - -namespace mshadow{ - // implementations - template - inline void SaveBinary( TStream &fo, const Tensor &src_ ){ - fo.Write( src_.shape.shape_, sizeof(index_t) * dim ); - Tensor src = src_.FlatTo2D(); - for( index_t i = 0; i < src.shape[1]; ++ i ){ - fo.Write( src[i].dptr, sizeof(real_t)*src.shape[0] ); - } - } - template - inline void SaveBinary( TStream &fo, const Tensor &src ){ - // copy to CPU, then save - Tensor tmp( src.shape ); - AllocSpace( tmp ); - Copy( tmp, src ); - SaveBinary( fo, tmp ); - FreeSpace( tmp ); - } - - template - inline void LoadBinary( TStream &fi, Tensor &dst_, bool pre_alloc ){ - Shape shape; - utils::Assert( fi.Read( shape.shape_, sizeof(index_t) * dim ) != 0, "mshadow::LoadBinary" ); - if( pre_alloc ){ - utils::Assert( shape == dst_.shape ); - }else{ - dst_.shape = shape; AllocSpace( dst_ ); - } - Tensor dst = dst_.FlatTo2D(); - if( dst.shape[0] == 0 ) return; - for( index_t i = 0; i < dst.shape[1]; ++ i ){ - utils::Assert( fi.Read( dst[i].dptr, sizeof(real_t)*dst.shape[0] ) != 0, "mshadow::LoadBinary" ); - } - } - template - inline void LoadBinary( TStream &fi, Tensor &dst, bool pre_alloc ){ - Tensor tmp; - LoadBinary( fi, tmp, false ); - if( pre_alloc ){ - utils::Assert( tmp.shape == dst.shape ); - }else{ - dst.shape = tmp.shape; AllocSpace( dst ); - } - Copy( dst, tmp ); - FreeSpace( tmp ); - } -}; -#endif // TENSOR_IO_H diff --git a/mshadow/tensor_random.h b/mshadow/tensor_random.h deleted file mode 100644 index b3f0b8498e0c..000000000000 --- a/mshadow/tensor_random.h +++ /dev/null @@ -1,299 +0,0 @@ -#ifndef MSHADOW_TENSOR_RANDOM_H -#define MSHADOW_TENSOR_RANDOM_H -/*! - * \file tensor_random.h - * \brief Random inline functions for tensor. - * \author Bing Xu, Tianqi Chen - * Based on curand|MKL|stdlib - */ -#include -#include "tensor.h" -#include "tensor_container.h" - -namespace mshadow { - /*! - * \brief random number generator - * \tparam Device the device of random number generator - */ - template - class Random {}; - - /*! \brief CPU random number generator */ - template<> - class Random { - public: - /*! - * \brief constructor of random engine - * \param seed random number seed - */ - Random( int seed ){ - #if MSHADOW_USE_MKL - int status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed); - utils::Assert( status == VSL_STATUS_OK, "MKL VSL Random engine failed to be initialized.\n" ); - #else - srand(seed); - #endif - buffer_.Resize( Shape1( kRandBufferSize ) ); - } - ~Random() { - #if MSHADOW_USE_MKL - vslDeleteStream(&vStream_); - #endif - } - /*! - * \brief seed random number generator using this seed - * \param seed seed of prng - */ - inline void Seed( int seed ){ - #if MSHADOW_USE_MKL - int status = vslDeleteStream(&vStream_); - utils::Assert(status == VSL_STATUS_OK); - status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed); - utils::Assert(status == VSL_STATUS_OK); - #else - srand( seed ); - #endif - } - /*! 
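// Illustrative sketch (legacy mshadow-1.x API assumed) of the tensor_io.h
// interface removed above: wrap a FILE* in utils::FileStream and hand it to
// SaveBinary / LoadBinary.
#include <cstdio>
inline void SaveWeight(const Tensor<cpu, 2> &weight, const char *fname) {
  FILE *fp = fopen(fname, "wb");
  utils::FileStream fs(fp);
  SaveBinary(fs, weight);   // writes the shape header, then row by row
  fs.Close();
}
inline void LoadWeight(Tensor<cpu, 2> &weight, const char *fname) {
  FILE *fp = fopen(fname, "rb");
  utils::FileStream fs(fp);
  LoadBinary(fs, weight, false);  // false: LoadBinary allocates the space
  fs.Close();
}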
- * \brief generate data from uniform [a,b) - * \param dst destination - * \param a lower bound of uniform - * \param b upper bound of uniform - * \tparam dim dimension of tensor - */ - template - inline void SampleUniform( Tensor &dst, real_t a=0.0f, real_t b=1.0f ) { - Tensor mat = dst.FlatTo2D(); - for ( index_t i = 0; i < mat.shape[1]; ++i ) { - #if MSHADOW_USE_MKL - #if MSHADOW_SINGLE_PRECISION - int status = vsRngUniform( 0, vStream_, mat.shape[0], mat[i].dptr, a, b ); - #else - int status = vdRngUniform( 0, vStream_, mat.shape[0], mat[i].dptr, a, b ); - #endif - utils::Assert(status == VSL_STATUS_OK, "Failed to generate random number by MKL.\n" ); - #else - // use stdlib - for ( index_t j = 0; j < mat.shape[0]; ++j ) { - mat[i][j] = this->RandNext()*(b-a) + a; - } - #endif - } - } - /*! - * \brief generate data from standard gaussian - * \param dst destination - * \param mu mean variable - * \param sigma standard deviation - * \tparam dim dimension of tensor - */ - template - inline void SampleGaussian( Tensor &dst, real_t mu = 0.0f, real_t sigma = 1.0f ) { - if( sigma <= 0.0f ) { - dst = mu; return; - } - Tensor mat = dst.FlatTo2D(); - for (index_t i = 0; i < mat.shape[1]; ++i) { - #if MSHADOW_USE_MKL - #if MSHADOW_SINGLE_PRECISION - int status = vsRngGaussian( 0, vStream_, mat.shape[0], mat[i].dptr, mu, sigma ); - #else - int status = vdRngGaussian( 0, vStream_, mat.shape[0], mat[i].dptr, mu, sigma ); - #endif - utils::Assert(status == VSL_STATUS_OK, "Failed to generate random number by MKL.\n" ); - #else - real_t g1 = 0.0f, g2 = 0.0f; - for (index_t j = 0; j < mat.shape[0]; ++j) { - if( (j & 1) == 0 ){ - this->SampleNormal2D( g1, g2 ); - mat[i][j] = mu + g1 * sigma; - }else{ - mat[i][j] = mu + g2 * sigma; - } - } - #endif - } - } - /*! - * \brief return a temporal expression storing standard gaussian random variables - * the temporal tensor is only valid before next call of gaussian or uniform - * can be used as part of expression - * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, - * since second call of gaussian(s2) makes gaussian(s1) invalid - * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression - * \param shape shape of the tensor - * \tparam dim dimension of tensor - */ - template - inline expr::ReshapeExp,dim,1> gaussian( Shape shape ){ - buffer_.Resize( Shape1( shape.Size() ) ); - this->SampleGaussian( buffer_, 0.0f, 1.0f ); - return expr::reshape( buffer_, shape ); - } - /*! - * \brief return a temporal expression storing standard uniform [0,1) - * the temporal tensor is only valid before next call of gaussian or uniform - * can be used as part of expression - * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, - * since second call of gaussian(s2) makes gaussian(s1) invalid - * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression - * \param shape shape of the tensor - * \tparam dim dimension of tensor - */ - template - inline expr::ReshapeExp,dim,1> uniform( Shape shape ){ - buffer_.Resize( Shape1( shape.Size() ) ); - this->SampleUniform( buffer_, 0.0f, 1.0f ); - return expr::reshape( buffer_, shape ); - } - private: - /*! \brief get next random number from rand */ - inline real_t RandNext( void ){ - return static_cast(rand()) / (static_cast(RAND_MAX)+1.0f); - } - /*! 
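// Illustrative sketch (mshadow-1.x style) of the caveat documented above:
// gaussian() / uniform() return an expression backed by one internal buffer,
// so use at most one of them per assignment.
inline void InitWeight(Random<cpu> &rnd, Tensor<cpu, 2> weight) {
  weight = rnd.gaussian(weight.shape) * 0.01f;          // OK: one call
  // weight = rnd.gaussian(weight.shape) * rnd.gaussian(weight.shape);
  // ^ would be wrong: the second call invalidates the buffer backing the first.
}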
\brief return a real numer uniform in (0,1) */ - inline real_t RandNext2( void ){ - return (static_cast( rand() ) + 1.0 ) / (static_cast(RAND_MAX) + 2.0); - } - /*! - * \brief sample iid xx,yy ~N(0,1) - * \param xx first gaussian output - * \param yy second gaussian output - */ - inline void SampleNormal2D( real_t &xx, real_t &yy ){ - real_t x,y,s; - do{ - x = 2.0f * RandNext2() - 1.0f; - y = 2.0f * RandNext2() - 1.0f; - s = x*x + y*y; - }while( s >= 1.0f || s == 0.0f ); - real_t t = std::sqrt( -2.0f * std::log( s ) / s ) ; - xx = x * t; yy = y * t; - } - private: - #if MSHADOW_USE_MKL - /*! \brief stream used by MKL VSL */ - VSLStreamStatePtr vStream_; - #endif - /*! \brief temporal space used to store random numbers */ - TensorContainer buffer_; - }; // class Random - -#ifdef __CUDACC__ - - /*! \brief GPU random number generator */ - template<> - class Random { - public: - /*! - * \brief constructor of random engine - * \param seed random number seed - */ - Random(int seed) { - curandStatus_t status; - status = curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT); - utils::Assert(status == CURAND_STATUS_SUCCESS, "Can not create CURAND Generator"); - this->Seed( seed ); - buffer_.Resize( Shape1(kRandBufferSize) ); - } - - ~Random() { - curandStatus_t status; - status = curandDestroyGenerator(gen_); - utils::Assert(status == CURAND_STATUS_SUCCESS, "Destory CURAND Gen failed"); - } - /*! - * \brief seed random number generator using this seed - * \param seed seed of prng - */ - inline void Seed( int seed ){ - curandStatus_t status; - status = curandSetPseudoRandomGeneratorSeed(gen_, seed); - utils::Assert(status == CURAND_STATUS_SUCCESS, "Set CURAND seed failed."); - } - /*! - * \brief generate data from uniform [a,b) - * \param dst destination - * \param a lower bound of uniform - * \param b upper bound of uniform - * \tparam dim dimension of tensor - */ - template - inline void SampleUniform(Tensor &dst, real_t a=0.0f, real_t b=1.0f) { - if( a == 0.0f && b == 1.0f ){ - dst = this->uniform( dst.shape ); - }else{ - dst = this->uniform( dst.shape ) *(b-a) + a; - } - } - /*! - * \brief generate data from standard gaussian - * \param dst destination - * \param mu mean variable - * \param sigma standard deviation - * \tparam dim dimension of tensor - */ - template - inline void SampleGaussian(Tensor &dst, real_t mu = 0.0f, real_t sigma = 1.0f) { - dst = this->gaussian( dst.shape, mu, sigma ); - } - /*! 
- * \brief return a temporal expression storing standard gaussian random variables - * the temporal tensor is only valid before next call of gaussian or uniform - * can be used as part of expression - * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, - * since second call of gaussian(s2) makes gaussian(s1) invalid - * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression - * \param shape shape of the tensor - * \param mu mean - * \param sigma variance - * \tparam dim dimension of tensor - */ - template - inline expr::ReshapeExp,dim,1> gaussian( Shape shape, real_t mu=0.0f, real_t sigma=1.0f){ - size_t aligned_sz = ((shape.Size() + 1UL)>>1)<<1; - // allocate alligned size - buffer_.Resize( Shape1( aligned_sz ) ); - buffer_.Resize( Shape1( shape.Size() ) ); - curandStatus_t status; - #if MSHADOW_SINGLE_PRECISION - status = curandGenerateNormal(gen_, buffer_.dptr, aligned_sz , mu, sigma); - #else - status = curandGenerateNormalDouble(gen_, buffer_.dptr, buffer_.shape[0], mu, sigma); - #endif - utils::Assert(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed\n"); - return expr::reshape( buffer_, shape ); - } - /*! - * \brief return a temporal expression storing standard uniform [0,1) - * the temporal tensor is only valid before next call of gaussian or uniform - * can be used as part of expression - * Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result, - * since second call of gaussian(s2) makes gaussian(s1) invalid - * A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression - * \param shape shape of the tensor - * \tparam dim dimension of tensor - */ - template - inline expr::ReshapeExp,dim,1> uniform(Shape shape) { - buffer_.Resize( Shape1( shape.Size() ) ); - curandStatus_t status; - #if MSHADOW_SINGLE_PRECISION - status = curandGenerateUniform(gen_, buffer_.dptr, buffer_.shape[0] ); - #else - status = curandGenerateUniformDouble(gen_, buffer_.dptr, buffer_.shape[0] ); - #endif - utils::Assert(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed\n"); - return expr::reshape( buffer_, shape ); - } - private: - /*! \brief random numbeer generator */ - curandGenerator_t gen_; - /*! \brief templ buffer */ - TensorContainer buffer_; - }; // class Random - #endif - -}; // namespace mshadow - -#endif // MSHADOW_TENSOR_RANDOM_H diff --git a/mshadow/tensor_sse-inl.hpp b/mshadow/tensor_sse-inl.hpp deleted file mode 100644 index b98383e83d6a..000000000000 --- a/mshadow/tensor_sse-inl.hpp +++ /dev/null @@ -1,431 +0,0 @@ -#ifndef MSHADOW_TENSOR_SSE_INL_HPP -#define MSHADOW_TENSOR_SSE_INL_HPP -/*! - * \file tensor_sse-inl.hpp - * \brief support of sse2 optimization of some operations - * \author Tianqi Chen - */ -#ifdef __APPLE__ -#include -#else -#include -#endif - -#include "tensor_expr.h" -#include "tensor.h" - -namespace mshadow { - /*! \brief namespace to support sse2 vectorization */ - namespace sse2{ - /*! 
- * \brief analog to cudaMallocPitch, allocate a aligned space with num_line * lspace cells - * \param pitch output parameter, the actuall space allocated for each line - * \param lspace number of cells required for each line - * \param num_line number of lines to be allocated - */ - inline void* AlignedMallocPitch( size_t &pitch, size_t lspace, size_t num_line ){ - pitch = ((lspace+15) >> 4) << 4; - #ifdef _MSC_VER - void * res = _aligned_malloc( pitch*num_line, 16 ); - #else - #ifdef __APPLE__ - void *res = malloc( pitch * num_line ); - #else - void * res = memalign( 16, pitch*num_line ); - #endif - #endif - utils::Assert( res != NULL, "AlignedMallocPitch failed" ); - return res; - } - /*! - * \brief free aligned space - * \param ptr pointer to space to be freed - */ - inline void AlignedFree( void *ptr ){ - #ifdef _MSC_VER - _aligned_free( ptr ); - #else - free( ptr ); - #endif - } - /*! \brief check if a pointer is aligned */ - inline bool CheckAlign( size_t pitch ){ - return !(pitch & ((1<<4)-1)); - } - /*! \brief check if a pointer is aligned */ - inline bool CheckAlign( void *ptr ){ - return CheckAlign( (size_t)ptr ); - } - /*! - * \brief get upper bound of aligned index of size - * \param size size of the array - * \param fsize size of float - */ - inline index_t UpperAlign( index_t size, size_t fsize ){ - return (( (size*fsize+15) >> 4 ) << 4) / fsize; - } - /*! - * \brief get lower bound of aligned index of size - * \param size size of the array - * \param fsize size of float - */ - inline index_t LowerAlign( index_t size, size_t fsize ){ - return (( (size*fsize) >> 4 ) << 4) / fsize; - } - }; // namespace sse2 -}; // namespace mshadow - -#if MSHADOW_USE_SSE -// sse types are not compatible with nvcc, only use them in cpu mode -#include - -namespace mshadow{ - namespace sse2{ - /*! - * \brief float vector real type, used for vectorization - * \tparam FloatType double or float - */ - template struct FVec{}; - - /*! \brief vector real type for float */ - template<> - struct FVec { - public: - typedef __m128 DType; - /*! \brief number of float in vector */ - const static index_t kSize = 4; - /*! \brief data content */ - DType data_; - public: - /* constructors */ - FVec( void ){} - FVec( DType data ):data_(data){} - /* set the float */ - FVec( const float &s ){ - data_ = _mm_set1_ps( s ); - } - /*!\brief load from pointer src */ - FVec( const float *src ){ - data_ = _mm_load_ps( src ); - } - public: - /*! \brief store data into dst space */ - inline void Store( float *dst ) const{ - return _mm_store_ps( dst, data_ ); - } - /*! \brief sum of all content */ - inline float Sum( void ) const{ - DType ans = _mm_add_ps( data_, _mm_movehl_ps( data_, data_ ) ); - DType rst = _mm_add_ss( ans, _mm_shuffle_ps( ans, ans, 1 ) ); - #if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64) - return rst.m128_f32[ 0 ]; - #else - float rr = _mm_cvtss_f32( rst ) ; - return rr; - #endif - } - }; - - /*! \brief vector real type for float */ - template<> - struct FVec { - public: - typedef __m128d DType; - /*! \brief number of float in vector */ - const static index_t kSize = 2; - /*! \brief data content */ - DType data_; - public: - /* constructors */ - FVec( void ){} - FVec( DType data ):data_(data){} - /* set the float */ - FVec( const double &s ){ - data_ = _mm_set1_pd( s ); - } - /*!\brief load from pointer src */ - FVec( const double *src ){ - data_ = _mm_load_pd( src ); - } - public: - /*! 
\brief store data into dst space */ - inline void Store( double *dst ) const{ - return _mm_store_pd( dst, data_ ); - } - /*! \brief sum of all content */ - inline double Sum( void ) const{ - DType tmp = _mm_add_sd( data_, _mm_unpackhi_pd( data_,data_ ) ) ; - #if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64) - return tmp.m128d_f64[0]; - #else - double ans = _mm_cvtsd_f64( tmp ); - return ans; - #endif - } - }; - }; - - namespace sse2{ - /*! \brief sse2 operator type of certain operator */ - template - struct SSEOp{ - const static bool kEnabled = false; - }; - template<> - struct SSEOp{ - const static bool kEnabled = true; - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_add_ps( lhs.data_, rhs.data_ ) ); - } - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_add_pd( lhs.data_, rhs.data_ ) ); - } - }; - template<> - struct SSEOp{ - const static bool kEnabled = true; - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_sub_ps( lhs.data_, rhs.data_ ) ); - } - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_sub_pd( lhs.data_, rhs.data_ ) ); - } - }; - template<> - struct SSEOp{ - const static bool kEnabled = true; - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_mul_ps( lhs.data_, rhs.data_ ) ); - } - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_mul_pd( lhs.data_, rhs.data_ ) ); - } - }; - template<> - struct SSEOp{ - const static bool kEnabled = true; - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_div_ps( lhs.data_, rhs.data_ ) ); - } - MSHADOW_CINLINE static FVec Map( const FVec &lhs, const FVec &rhs ){ - return FVec( _mm_div_pd( lhs.data_, rhs.data_ ) ); - } - }; - - template<> - struct SSEOp{ - const static bool kEnabled = true; - MSHADOW_CINLINE static FVec Map( const FVec &src ){ - return src; - } - MSHADOW_CINLINE static FVec Map( const FVec &src ){ - return src; - } - }; - }; // namespace sse2 - - namespace sse2{ - // savers to do storage - template - struct Saver{ - MSHADOW_CINLINE static void Save( TFloat *dst, const FVec &src ){ - FVec lhs( dst ); - FVec ans = SSEOp::Map( lhs, src ); - ans.Store( dst ); - } - }; - template - struct Saver{ - MSHADOW_CINLINE static void Save( TFloat *dst, const FVec &src ){ - src.Store( dst ); - } - }; - }; // namespace sse2 -}; // namespace mshadow - -namespace mshadow{ - namespace expr{ - // same as plan, but use sse2 - template - class SSEPlan { - public: - /*! 
- * \brief evaluate the expression at index [y][x], x will be aligned to 4 - * to be implemented by SubType - */ - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const; - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const; - }; - - template - class SSEPlan< Tensor >{ - public: - SSEPlan( const Tensor &t ) - :dptr_(t.dptr),stride_(t.shape.stride_){} - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const{ - return sse2::FVec( &dptr_[ y*stride_+x ] ); - } - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ - return dptr_[ y * stride_ + x ]; - } - private: - const real_t *dptr_; - index_t stride_; - }; - - template<> - class SSEPlan{ - public: - SSEPlan( real_t scalar ):scalar_(scalar){} - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const{ - return sse2::FVec( scalar_ ); - } - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ - return scalar_; - } - private: - real_t scalar_; - }; - - template - class SSEPlan< BinaryMapExp >{ - public: - SSEPlan( const SSEPlan &lhs, const SSEPlan &rhs ) - :lhs_(lhs), rhs_(rhs){} - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const{ - return sse2::SSEOp::Map( lhs_.EvalSSE( y, x ), rhs_.EvalSSE( y, x ) ); - } - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ - return OP::Map( lhs_.Eval( y, x ), rhs_.Eval( y, x ) ); - } - private: - SSEPlan lhs_; - SSEPlan rhs_; - }; - - template - class SSEPlan< UnaryMapExp >{ - public: - SSEPlan( const SSEPlan &src ):src_(src){} - MSHADOW_CINLINE sse2::FVec EvalSSE( index_t y, index_t x ) const{ - return sse2::SSEOp::Map( src_.EvalSSE( y, x ) ); - } - MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ - return OP::Map( src_.Eval( y, x ) ); - } - private: - SSEPlan src_; - }; - - template - inline SSEPlan< BinaryMapExp > MakeSSEPlan( const BinaryMapExp &e ); - - inline SSEPlan MakeSSEPlan( const ScalarExp &e ){ - return SSEPlan( e.scalar_ ); - } - - template - inline SSEPlan MakeSSEPlan( const ContainerExp &e ){ - return SSEPlan( e.self() ); - } - - template - inline SSEPlan MakeSSEPlan( const MakeTensorExp &e ){ - return SSEPlan( e.real_self() ); - } - - template - inline SSEPlan< UnaryMapExp > MakeSSEPlan( const UnaryMapExp &e ){ - return SSEPlan< UnaryMapExp >( MakeSSEPlan(e.src_) ); - } - - template - inline SSEPlan< BinaryMapExp > MakeSSEPlan( const BinaryMapExp &e ){ - return SSEPlan< BinaryMapExp >( MakeSSEPlan(e.lhs_), MakeSSEPlan(e.rhs_) ); - } - }; - - namespace expr{ - /*! 
- * \brief static check sse enable - * if a expression E can not be evaluated using sse, then kPass = false - * \tparam Device the type of Device - * \tparam dim dimension of the tensor - * \tparam E expression - */ - template - struct SSECheck{ - const static bool kPass = false; - }; - template<> - struct SSECheck{ - const static bool kPass = true; - }; - template - struct SSECheck >{ - const static bool kPass = true; - }; - - template - struct SSECheck >{ - const static bool kPass = SSECheck::kPass && sse2::SSEOp::kEnabled; - }; - template - struct SSECheck< BinaryMapExp >{ - const static bool kPass = SSECheck::kPass && SSECheck::kPass && sse2::SSEOp::kEnabled; - }; - }; // namespace expr - namespace expr{ - // check if data is aligned and allow sse operation - template - struct SSEAlignCheck{ - inline static bool Check( const E &exp ){ - return false; - } - }; - template - struct SSEAlignCheck< dim, ScalarExp >{ - inline static bool Check( const ScalarExp &exp ){ - return true; - } - }; - template - struct SSEAlignCheck< dim,Tensor >{ - inline static bool Check( const Tensor &t ){ - return sse2::CheckAlign( t.dptr ) && sse2::CheckAlign( t.shape.stride_ * sizeof( real_t ) ); - } - }; - template - struct SSEAlignCheck< dim, UnaryMapExp >{ - inline static bool Check( const UnaryMapExp &t ){ - return SSEAlignCheck::Check( t.src_); - } - }; - template - struct SSEAlignCheck< dim, BinaryMapExp >{ - inline static bool Check( const BinaryMapExp &t ){ - return SSEAlignCheck::Check( t.lhs_ ) && - SSEAlignCheck::Check( t.rhs_ ); - } - }; - }; // namespace expr - - /*! - * \brief use SSEPlan to compute result - */ - template - inline void MapSSEPlan(Tensor _dst, const expr::SSEPlan &plan){ - Tensor dst = _dst.FlatTo2D(); - const index_t xlen = sse2::LowerAlign( dst.shape[0], sizeof(real_t) ); - for ( index_t y = 0; y < dst.shape[1]; y ++ ) { - for( index_t x = 0; x < xlen; x += sse2::FVec::kSize ){ - sse2::Saver::Save( &dst[y][x], plan.EvalSSE( y,x ) ); - } - for( index_t x = xlen; x < dst.shape[0]; x ++ ){ - SV::Save( dst[y][x], plan.Eval(y,x) ); - } - } - } -}; // namespace mshadow -#endif // MSHADOW_USE_SSE -#endif // MSHADOW_TENSOR_SSE_INL_HPP diff --git a/mshadow/utils.h b/mshadow/utils.h new file mode 100644 index 000000000000..6003f5562814 --- /dev/null +++ b/mshadow/utils.h @@ -0,0 +1,81 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file utils.h + * \brief simple utils for error and checkings + * \author Tianqi Chen + */ +#ifndef MSHADOW_UTILS_H_ +#define MSHADOW_UTILS_H_ +#define _CRT_SECURE_NO_WARNINGS +#include +#include +#include +#include +namespace mshadow { +/*! \brief namespace for helper utils of the project */ +namespace utils { +/*! \brief error message buffer length */ +const int kPrintBuffer = 1 << 12; + +#ifndef MSHADOW_CUSTOMIZE_ASSERT_ +/*! + * \brief handling of Assert error, caused by in-apropriate input + * \param msg error message + */ +inline void HandleAssertError(const char *msg) { + fprintf(stderr, "AssertError:%s\n", msg); + exit(-1); +} +/*! + * \brief handling of Check error, caused by in-apropriate input + * \param msg error message + */ +inline void HandleCheckError(const char *msg) { + fprintf(stderr, "%s\n", msg); + exit(-1); +} +#else +// include declarations, some one must implement this +void HandleAssertError(const char *msg); +void HandleCheckError(const char *msg); +void HandlePrint(const char *msg); +#endif + +/*! \brief assert an condition is true, use this to handle debug information */ +inline void Assert(bool exp, const char *fmt, ...) 
{ + if (!exp) { + std::string msg(kPrintBuffer, '\0'); + va_list args; + va_start(args, fmt); + vsnprintf(&msg[0], kPrintBuffer, fmt, args); + va_end(args); + HandleAssertError(msg.c_str()); + } +} + +/*!\brief same as assert, but this is intended to be used as message for user*/ +inline void Check(bool exp, const char *fmt, ...) { + if (!exp) { + std::string msg(kPrintBuffer, '\0'); + va_list args; + va_start(args, fmt); + vsnprintf(&msg[0], kPrintBuffer, fmt, args); + va_end(args); + HandleCheckError(msg.c_str()); + } +} + +/*! \brief report error message, same as check */ +inline void Error(const char *fmt, ...) { + { + std::string msg(kPrintBuffer, '\0'); + va_list args; + va_start(args, fmt); + vsnprintf(&msg[0], kPrintBuffer, fmt, args); + va_end(args); + HandleCheckError(msg.c_str()); + } +} +} // namespace utils +} // namespace mshadow +#endif // MSHADOW_UTILS_H_ diff --git a/example/neuralnet/Makefile.openblas b/test/Makefile similarity index 56% rename from example/neuralnet/Makefile.openblas rename to test/Makefile index ef82c1115df7..061b99b2e119 100644 --- a/example/neuralnet/Makefile.openblas +++ b/test/Makefile @@ -1,24 +1,21 @@ # set LD_LIBRARY_PATH -# echo "Link mshadow with precomplied Openblas" -export OPENBLAS_ROOT=../../../OpenBLAS-v0.2.13-Win64-int32 export CC = gcc export CXX = g++ export NVCC =nvcc -export CFLAGS = -Wall -O3 -msse3 -Wno-unknown-pragmas -funroll-loops -I../../ -I$(OPENBLAS_ROOT)/include -DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CBLAS=1 -D__APPLE__ -export LDFLAGS= -static -lpthread -lopenblas -L$(OPENBLAS_ROOT)/lib +export CFLAGS = -Wall -O3 -g -msse3 -Wno-unknown-pragmas -funroll-loops -I../ +export LDFLAGS= -g -lm -lcublas -lcudart export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) # specify tensor path -BIN = nnet convnet +BIN = OBJ = CUOBJ = -CUBIN = +CUBIN = test .PHONY: clean all -all: $(BIN) $(OBJ) $(CUBIN) $(CUOBJ) +all: $(CUBIN) $(OBJ) -nnet: nnet.cpp -convnet: convnet.cpp +test: test.cu $(BIN) : $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) diff --git a/test/pairtest.cu b/test/pairtest.cu new file mode 100644 index 000000000000..56b0380747a7 --- /dev/null +++ b/test/pairtest.cu @@ -0,0 +1,105 @@ +#include "mshadow/tensor.h" +#include "old/tensor.h" +#include "assert.h" +#include + +using mshadow::index_t; +template +void Print(T const & ist, int I, int J) { + for (int i = 0; i < I; ++i) { + for (int j = 0; j < J; ++j) { + printf("%.2f ", ist[i][j]); + } + printf("\n"); + } +} + +bool Check(mshadow::TensorContainer &mct, \ + Xmshadow::TensorContainer &xct) { + for (index_t i = 0; i < mct.size(0); ++i) { + for (index_t j = 0; j < mct.size(1); ++j) { + assert(mct[i][j] == xct[i][j]); + } + } + return true; +} + +template +void RunTask() { + const int X = 6; + const int K = 2; + const int O = (X - K) / 2 + 1; + mshadow::TensorContainer srcm(mshadow::Shape4(1,1,X, X)); + Xmshadow::TensorContainer srcx(Xmshadow::Shape4(1,1,X, X)); + for (int i = 0; i < X; ++i) { + for (int j = 0; j < X; ++j) { + srcm[0][0][i][j] = i * 0.1f + j * 0.1f; + srcx[0][0][i][j] = i * 0.1f + j * 0.1f; + } + } + printf("Source:\n"); + Print(srcm[0][0], X, X); + printf("\n"); + mshadow::TensorContainer mct(mshadow::Shape4(1,1,X, X)); + Xmshadow::TensorContainer xct(Xmshadow::Shape4(1,1,X, X)); + mshadow::Copy(mct, srcm); + Xmshadow::Copy(xct, srcx); + + + mshadow::TensorContainer pool_ct(mshadow::Shape4(1,1, O, O)); + Xmshadow::TensorContainer pool_xct(Xmshadow::Shape4(1,1,O,O)); + + pool_ct = mshadow::expr::pool(mct, K, K, K); + pool_xct = 
Xmshadow::expr::pool(xct, K, K); + + printf("New pool:\n"); + Print(pool_ct[0][0], O, O); + printf("\nOld pool:\n"); + Print(pool_xct[0][0], O, O); + printf("\n"); + mshadow::TensorContainer gpool_src(mshadow::Shape4(1,1, O, O)); + Xmshadow::TensorContainer gpool_xsrc(Xmshadow::Shape4(1,1,O,O)); + for (int i = 0; i < O; ++i) { + for (int j = 0; j < O; ++j) { + gpool_src[0][0][i][j] = 0.1f; + gpool_xsrc[0][0][i][j] = 0.1f; + } + } + mshadow::TensorContainer gpool_ct(mshadow::Shape4(1,1, O, O)); + Xmshadow::TensorContainer gpool_xct(Xmshadow::Shape4(1,1,O,O)); + mshadow::Copy(gpool_ct, gpool_src); + Xmshadow::Copy(gpool_xct, gpool_xsrc); + + mshadow::TensorContainer mout(mshadow::Shape4(1,1,X, X)); + Xmshadow::TensorContainer xout(Xmshadow::Shape4(1,1,X, X)); + + mout = mshadow::expr::unpool(mct, pool_ct, gpool_ct, K, K, K); + xout = Xmshadow::expr::unpool(xct, pool_xct, gpool_xct, K, K); + + mshadow::Copy(srcm, mout); + Xmshadow::Copy(srcx, xout); + + mshadow::TensorContainer l1(mshadow::Shape2(X,X)); + Xmshadow::TensorContainer l2(Xmshadow::Shape2(X, X)); + l1 = mshadow::expr::reshape(srcm, l1.shape_); + l2 = Xmshadow::expr::reshape(srcx, l2.shape); + printf("New unpool\n"); + Print(l1, l1.size(0), l1.size(1)); + printf("\nOld unpool\n"); + Print(l2, X, X); + if (Check(l1, l2)) { + printf("Pass\n"); + } +} + +int main(int argc, char** argv) { + if (argc < 1) { + printf("Usage: dev\n"); + exit(-1); + } + if (!strcmp(argv[1], "cpu")) { + RunTask(); + } else { + RunTask(); + } +} \ No newline at end of file diff --git a/test/pool.cu b/test/pool.cu new file mode 100644 index 000000000000..9641d53c9c45 --- /dev/null +++ b/test/pool.cu @@ -0,0 +1,69 @@ +#include "mshadow/tensor.h" +#include "old/tensor.h" +#include "assert.h" +#include + +using mshadow::index_t; +template +void Print(T const & ist) { + for (int i = 0; i < ist.size(0); ++i) { + for (int j = 0; j < ist.size(1); ++j) { + printf("%.2f ", ist[i][j]); + } + printf("\n"); + } +} + +bool Check(mshadow::TensorContainer &mct, \ + Xmshadow::TensorContainer &xct) { + for (index_t i = 0; i < mct.size(0); ++i) { + for (index_t j = 0; j < mct.size(1); ++j) { + assert(mct[i][j] == xct[i][j]); + } + } + return true; +} + +template +void RunTask() { + const int X = 6; + const int K = 2; + mshadow::TensorContainer srcm(mshadow::Shape2(X, X)); + Xmshadow::TensorContainer srcx(Xmshadow::Shape2(X, X)); + + mshadow::TensorContainer mct(mshadow::Shape2(X, X)); + Xmshadow::TensorContainer xct(Xmshadow::Shape2(X, X)); + for (int i = 0; i < X; ++i) { + for (int j = 0; j < X; ++j) { + srcm[i][j] = i * 0.1f + j * 0.1f; + srcx[i][j] = i * 0.1f + j * 0.1f; + } + } + mshadow::Copy(mct, srcm); + Xmshadow::Copy(xct, srcx); + mshadow::TensorContainer pool_ct(mshadow::Shape2((X-K)/2+1, (X-K)/2+1)); + Xmshadow::TensorContainer pool_xct(Xmshadow::Shape2((X-K)/2+1, (X-K)/2+1)); + + pool_ct = mshadow::expr::pool(mct, K, K, K); + pool_xct = Xmshadow::expr::pool(xct, K, K); + + mshadow::TensorContainer cpool_ct(mshadow::Shape2((X-K)/2+1, (X-K)/2+1)); + Xmshadow::TensorContainer cpool_xct(Xmshadow::Shape2((X-K)/2+1, (X-K)/2+1)); + mshadow::Copy(cpool_ct, pool_ct); + Xmshadow::Copy(cpool_xct, pool_xct); + if (Check(cpool_ct, cpool_xct)) { + printf("Pass\n"); + } +} + +int main(int argc, char** argv) { + if (argc < 2) { + printf("Usage: dev\n"); + exit(-1); + } + if (!strcmp(argv[1], "cpu")) { + RunTask(); + } else { + RunTask(); + } +} \ No newline at end of file diff --git a/test/reshape.cu b/test/reshape.cu new file mode 100644 index 000000000000..c1ad52e07c40 --- 
/dev/null +++ b/test/reshape.cu @@ -0,0 +1,74 @@ +#include "mshadow/tensor.h" +#include "old/tensor.h" +#include "assert.h" +#include + +using mshadow::index_t; +template +void Print(T const & ist) { + for (int i = 0; i < ist.size(0); ++i) { + for (int j = 0; j < ist.size(1); ++j) { + printf("%.2f ", ist[i][j]); + } + printf("\n"); + } +} + +bool Check(mshadow::TensorContainer &mct, \ + Xmshadow::TensorContainer &xct) { + for (index_t i = 0; i < mct.size(0); ++i) { + for (index_t j = 0; j < mct.size(1); ++j) { + assert(mct[i][j] == xct[i][j]); + } + } + return true; +} + +template +void RunTask() { + const int X = 6; + const int K = 2; + mshadow::TensorContainer srcm(mshadow::Shape2(X, X)); + Xmshadow::TensorContainer srcx(Xmshadow::Shape2(X, X)); + + mshadow::TensorContainer mct(mshadow::Shape2(X, X)); + Xmshadow::TensorContainer xct(Xmshadow::Shape2(X, X)); + for (int i = 0; i < X; ++i) { + for (int j = 0; j < X; ++j) { + srcm[i][j] = i * 0.1f + j * 0.1f; + srcx[i][j] = i * 0.1f + j * 0.1f; + } + } + mshadow::Copy(mct, srcm); + Xmshadow::Copy(xct, srcx); + + mshadow::TensorContainer mct4d(mshadow::Shape4(1, 1, X / K, X * K)); + Xmshadow::TensorContainer xct4d(Xmshadow::Shape4(X / K, X * K, 1, 1)); + + mct4d = mshadow::expr::reshape(mct, mct4d.shape_); + xct4d = Xmshadow::expr::reshape(xct, xct4d.shape); + + mct = mshadow::expr::reshape(mct4d, mct.shape_); + xct = Xmshadow::expr::reshape(xct4d, xct.shape); + + mshadow::TensorContainer m_ct(mshadow::Shape2(X, X)); + Xmshadow::TensorContainer x_ct(Xmshadow::Shape2(X, X)); + + mshadow::Copy(m_ct, mct); + Xmshadow::Copy(x_ct, xct); + if (Check(m_ct, x_ct)) { + printf("Pass\n"); + } +} + +int main(int argc, char** argv) { + if (argc < 2) { + printf("Usage: dev\n"); + exit(-1); + } + if (!strcmp(argv[1], "cpu")) { + RunTask(); + } else { + RunTask(); + } +} \ No newline at end of file diff --git a/test/test.cu b/test/test.cu new file mode 100644 index 000000000000..37fe7e76cbd0 --- /dev/null +++ b/test/test.cu @@ -0,0 +1,77 @@ +#include "test.h" + +using namespace mshadow; + + +int main() { + InitTensorEngine(); + Tensor tc = NewTensor(Shape3(3, 2, 4), 0.0f); + Tensor tg = NewTensor(tc.shape_, 0.0f); + // init + for (index_t i = 0; i < tc.size(0); ++i) { + for (index_t j = 0; j < tc.size(1); ++j) { + for (index_t k = 0; k < tc.size(2); ++k) { + tc[i][j][k] = i * 0.1f + j * 0.2f + k * 0.1f; + } + } + } + Copy(tg, tc); + // print + printf("\n#print batch 0 of cpu tensor:\n"); + Print2DTensor(tc[0]); + printf("\n"); + Print2DTensor(tc[1]); + printf("\n"); + Print2DTensor(tc[2]); + // check + if (Check2DTensor(tg[1], tc[1])) { + printf("batch 1 of gpu & cpu tensor are same.\n"); + } + // sum of row + Tensor tmp_tc = NewTensor(Shape1(tc[0].size(1)), 0.0f); + Tensor tmp_tg = NewTensor(Shape1(tg[0].size(1)), 0.0f); + printf("\n#sum_rows of batch 0:\n"); + tmp_tc = sum_rows(tc[0]); + tmp_tg = sum_rows(tg[0]); + Print1DTensor(tmp_tc); + if (Check1DTensor(tmp_tg, tmp_tc)) { + printf("cpu & gpu result consists\n"); + } + FreeSpace(&tmp_tc); + FreeSpace(&tmp_tg); + // sumall_except_dim + printf("\n#sumall_except_dim<0> of batch 0:\n"); + Tensor red_tc = NewTensor(Shape1(tc.size(0)), 0.0f); + Tensor red_tg = NewTensor(Shape1(tg.size(0)), 0.0f); + red_tc = sumall_except_dim<0>(tc); + red_tg = sumall_except_dim<0>(tg); + Print1DTensor(red_tc); + if (Check1DTensor(red_tg, red_tc)) { + printf("cpu & gpu result consists\n"); + } + FreeSpace(&red_tc); + FreeSpace(&red_tg); + // softmax + printf("\n#Softmax\n"); + Tensor sm_tc = NewTensor(tc[0].shape_, 0.0f); + 
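+  // sm_tg is the GPU-side output buffer matching sm_tc; Softmax fills both
+  // below and Check2DTensor compares the results element-wise within EPS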
Tensor sm_tg = NewTensor(tg[0].shape_, 0.0f); + Softmax(sm_tc, tc[0]); + Softmax(sm_tg, tg[0]); + if (Check2DTensor(sm_tg, sm_tc)) { + printf("cpu & gpu result consists\n"); + } + // mirror + printf("\n#mirror\n"); + sm_tc = mirror(tc[0]); + sm_tg = mirror(tg[0]); + if (Check2DTensor(sm_tg, sm_tc)) { + printf("cpu & gpu result consists\n"); + } + FreeSpace(&sm_tc); + FreeSpace(&sm_tg); + // reshape + + FreeSpace(&tc); + FreeSpace(&tg); + ShutdownTensorEngine(); +} diff --git a/test/test.h b/test/test.h new file mode 100644 index 000000000000..2cfc515957ca --- /dev/null +++ b/test/test.h @@ -0,0 +1,67 @@ +#ifndef TEST_H +#define TEST_H + +#include "mshadow/tensor.h" +#include "assert.h" + +#define EPS 0.0001 +using namespace mshadow; +using namespace mshadow::expr; + + +template +void Print2DTensor(Tensor const &ts); + +template +void Print1DTensor(Tensor const &ts); + +template<> +void Print1DTensor(Tensor const &ts) { + for (index_t i = 0; i < ts.size(0); ++i) { + printf("%.2f ", ts[i]); + } + printf("\n"); +} + + +template<> +void Print2DTensor(Tensor const &ts) { + for (index_t i = 0; i < ts.size(0); ++i) { + Print1DTensor(ts[i]); + } +} + +template<> +void Print2DTensor(Tensor const &tg) { + Tensor tc = NewTensor(tg.shape_, 0.0f); + Copy(tc, tg); + Print2DTensor(tc); + FreeSpace(&tc); +} + + + +bool Check2DTensor(Tensor const &tg, Tensor const &tc) { + Tensor tcc = NewTensor(tg.shape_, 0.0f); + Copy(tcc, tg); + for (index_t i = 0; i < tc.size(0); ++i) { + for (index_t j = 0; j < tc.size(1); ++j) { + assert(abs(tcc[i][j] - tc[i][j]) < EPS); + } + } + FreeSpace(&tcc); + return true; +} + +bool Check1DTensor(Tensor const &tg, Tensor const &tc) { + Tensor tcc = NewTensor(tc.shape_, 0.0f); + Copy(tcc, tg); + printf("gpu result:\n"); + Print1DTensor(tcc); + for (index_t i = 0; i < tc.size(0); ++i) { + assert(abs(tcc[i] - tc[i]) < EPS); + } + FreeSpace(&tcc); + return true; +} +#endif diff --git a/test/unpack.cu b/test/unpack.cu new file mode 100644 index 000000000000..dd0c2b9c5821 --- /dev/null +++ b/test/unpack.cu @@ -0,0 +1,85 @@ +#include "mshadow/tensor.h" +#include "old/tensor.h" +#include "assert.h" +#include + +using mshadow::index_t; +template +void Print(T const & ist) { + for (int i = 0; i < ist.size(0); ++i) { + for (int j = 0; j < ist.size(1); ++j) { + printf("%.2f ", ist[i][j]); + } + printf("\n"); + } +} + +bool Check(mshadow::TensorContainer &mct, \ + Xmshadow::TensorContainer &xct) { + for (index_t i = 0; i < mct.size(0); ++i) { + for (index_t j = 0; j < mct.size(1); ++j) { + assert(mct[i][j] == xct[i][j]); + } + } + return true; +} + +template +void RunTask() { + const int ksize = 3; + const int kstride = 2; + const int X = 6; + Xmshadow::TensorContainer xsrc(Xmshadow::Shape4(1, 1, X, X)); + mshadow::TensorContainer src(mshadow::Shape4(1, 1, X, X)); + + for (int i = 0; i < X; ++i) { + for (int j = 0; j < X; ++j) { + xsrc[0][0][i][j] = i * 0.1f + j * 0.2f; + src[0][0][i][j] = i * 0.1f + j * 0.2f; + } + } + Xmshadow::TensorContainer xin(Xmshadow::Shape4(1, 1, X, X)); + mshadow::TensorContainer in(mshadow::Shape4(1, 1, X, X)); + + mshadow::Copy(in, src); + Xmshadow::Copy(xin, xsrc); + + Xmshadow::TensorContainer xtmp_col; + mshadow::TensorContainer tmp_col; + + + index_t oheight = (in.size(2) - ksize)/kstride + 1; + index_t owidth = (in.size(3) - ksize)/kstride + 1; + index_t nbatch = in.size(0); + + + xtmp_col.Resize( Xmshadow::Shape2( xin.shape[2]*ksize*ksize, nbatch*oheight*owidth ) ); + tmp_col.Resize(mshadow::Shape2(in.size(1)*ksize*ksize, nbatch*oheight*owidth)); + 
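+  // note: the legacy Xmshadow unpack_patch2col takes a square (ksize, kstride)
+  // pair, while the new mshadow expression takes (ksize_y, ksize_x, kstride)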
xtmp_col = Xmshadow::expr::unpack_patch2col( xin, ksize, kstride );
+  tmp_col = mshadow::expr::unpack_patch2col(in, ksize, ksize, kstride);
+
+  Xmshadow::TensorContainer xtc;
+  mshadow::TensorContainer tc;
+
+  xtc.Resize( Xmshadow::Shape2( xin.shape[2]*ksize*ksize, nbatch*oheight*owidth ) );
+  tc.Resize(mshadow::Shape2(in.size(1)*ksize*ksize, nbatch*oheight*owidth));
+
+  mshadow::Copy(tc, tmp_col);
+  Xmshadow::Copy(xtc, xtmp_col);
+  if (Check(tc, xtc)) {
+    printf("Pass\n");
+  }
+
+}
+
+int main(int argc, char** argv) {
+  if (argc < 2) {
+    printf("Usage: dev\n");
+    exit(-1);
+  }
+  if (!strcmp(argv[1], "cpu")) {
+    RunTask();
+  } else {
+    RunTask();
+  }
+}
\ No newline at end of file
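The `mshadow/utils.h` header added above exposes printf-style `Assert`, `Check`, and `Error` helpers in `namespace mshadow::utils`. A minimal usage sketch, not part of the patch (the `batch_size` variable and its conditions are illustrative only):

```cpp
#include "mshadow/utils.h"

int main() {
  int batch_size = 16;
  // Check is meant for validating user-facing input: on failure it formats the
  // message into a bounded buffer, hands it to HandleCheckError and exits.
  mshadow::utils::Check(batch_size > 0,
                        "batch_size must be positive, got %d", batch_size);
  // Assert has the same signature but targets internal invariants; failures go
  // through HandleAssertError, which callers can replace by defining
  // MSHADOW_CUSTOMIZE_ASSERT_ and supplying their own handlers.
  mshadow::utils::Assert(batch_size % 2 == 0,
                         "expected an even batch_size, got %d", batch_size);
  return 0;
}
```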
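test/pairtest.cu and test/pool.cu above run the new and legacy pooling expressions side by side. The sketch below shows only the new-style call they exercise; the reducer template argument (`red::maximum`) and the `TensorContainer<device, dim, DType>` form are assumptions for illustration, not taken from the patch:

```cpp
#include "mshadow/tensor.h"

int main() {
  using namespace mshadow;
  const int X = 6, K = 2;
  const int O = (X - K) / K + 1;  // output extent for kernel K and stride K
  TensorContainer<cpu, 4, float> src(Shape4(1, 1, X, X));
  TensorContainer<cpu, 4, float> pooled(Shape4(1, 1, O, O));
  for (index_t i = 0; i < X; ++i) {
    for (index_t j = 0; j < X; ++j) {
      src[0][0][i][j] = 0.1f * (i + j);
    }
  }
  // new-style pooling: kernel extent in both directions plus an explicit stride,
  // mirroring the pool(mct, K, K, K) call in pairtest.cu
  pooled = expr::pool<red::maximum>(src, K, K, K);
  return 0;
}
```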