From 1286809a1fc76c0b808b988084fc0950300f40d4 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Wed, 16 Aug 2017 18:11:01 -0700 Subject: [PATCH 001/448] Change git clone to specific tag for installation (#7502) --- docs/build_version_doc/AddVersion.py | 9 +++++++++ docs/get_started/install.md | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/build_version_doc/AddVersion.py b/docs/build_version_doc/AddVersion.py index c48c630565b7..44959445edc0 100755 --- a/docs/build_version_doc/AddVersion.py +++ b/docs/build_version_doc/AddVersion.py @@ -86,6 +86,15 @@ outstr = outstr.replace('http://mxnet.io', 'https://mxnet.incubator.apache.org/' 'versions/%s' % (args.current_version)) + # Fix git clone to specific tag + if args.current_version == 'master': + outstr = outstr.replace('git clone --recursive https://github.com/dmlc/mxnet', + 'git clone --recursive https://github.com/apache/incubator-mxnet.git') + else: + outstr = outstr.replace('git clone --recursive https://github.com/dmlc/mxnet', + 'git clone --recursive https://github.com/apache/incubator-mxnet.git ' + '--branch %s' % (args.current_version)) + with open(os.path.join(path, name), "w") as outf: outf.write(outstr) diff --git a/docs/get_started/install.md b/docs/get_started/install.md index 0e88a0d2a2ee..2ab771d4cfef 100644 --- a/docs/get_started/install.md +++ b/docs/get_started/install.md @@ -2,7 +2,7 @@ Indicate your preferred configuration. Then, follow the customized commands to install *MXNet*. 
- + From 462dee7a1547367c8ab1aaa786d5b59f210788de Mon Sep 17 00:00:00 2001 From: Peiyun Hu Date: Thu, 17 Aug 2017 15:43:47 -0400 Subject: [PATCH 002/448] Fix description of argument parser (#7507) --- example/image-classification/train_imagenet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/image-classification/train_imagenet.py b/example/image-classification/train_imagenet.py index 5760a9af3782..f465fbc5f469 100644 --- a/example/image-classification/train_imagenet.py +++ b/example/image-classification/train_imagenet.py @@ -25,7 +25,7 @@ if __name__ == '__main__': # parse args - parser = argparse.ArgumentParser(description="train cifar10", + parser = argparse.ArgumentParser(description="train imagenet-1k", formatter_class=argparse.ArgumentDefaultsHelpFormatter) fit.add_fit_args(parser) data.add_data_args(parser) From 56eae588c097f12035356333b742489d8cf0eaae Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 17 Aug 2017 21:14:18 -0700 Subject: [PATCH 003/448] Fixed Makefile so a null CUDA_ARCH is treated like an unset one. (#7515) --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 33151e574ea7..b6c5834a566b 100644 --- a/Makefile +++ b/Makefile @@ -166,8 +166,8 @@ endif # Sets 'CUDA_ARCH', which determines the GPU architectures supported # by the compiled kernels. Users can edit the KNOWN_CUDA_ARCHS list below -# to remove archs they don't wish to support to speed compilation, or they -# can pre-set the CUDA_ARCH args in config.mk for full control. +# to remove archs they don't wish to support to speed compilation, or they can +# pre-set the CUDA_ARCH args in config.mk to a non-null value for full control. 
# # For archs in this list, nvcc will create a fat-binary that will include # the binaries (SASS) for all architectures supported by the installed version @@ -175,7 +175,7 @@ endif # If these kernels are then run on a newer-architecture GPU, the binary will # be JIT-compiled by the updated driver from the included PTX. ifeq ($(USE_CUDA), 1) -ifeq ($(origin CUDA_ARCH), undefined) +ifeq ($(CUDA_ARCH),) KNOWN_CUDA_ARCHS := 30 35 50 52 60 61 70 # Run nvcc on a zero-length file to check architecture-level support. # Create args to include SASS in the fat binary for supported levels. From ff21e1fd41f118dbbaf55d8f02a9669842ef565f Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 17 Aug 2017 21:16:51 -0700 Subject: [PATCH 004/448] Changed FullyConnected to use new linalg gemm, plus TensorCore if fp16 I/O. (#7505) * Converted FullyConnected to use new linalg gemm, plus TensorCore if fp16 I/O. * Simplified linalg_gemm interface to ease integration. * Correcting code in response to comments. * Removing Transpose(), leaving trailing req arg with default of kWriteTo. --- src/common/cuda_utils.h | 34 ++++++++--- src/operator/fully_connected-inl.h | 14 +++-- src/operator/linalg.h | 10 ++++ src/operator/linalg_impl.h | 86 +++++++++++++++++++++++++++ tests/python/gpu/test_operator_gpu.py | 5 ++ 5 files changed, 138 insertions(+), 11 deletions(-) diff --git a/src/common/cuda_utils.h b/src/common/cuda_utils.h index 483390fc9bea..0213c73177b3 100644 --- a/src/common/cuda_utils.h +++ b/src/common/cuda_utils.h @@ -200,7 +200,7 @@ inline DType __device__ CudaMin(DType a, DType b) { { \ cublasStatus_t e = (func); \ CHECK_EQ(e, CUBLAS_STATUS_SUCCESS) \ - << "cuBLAS: " << common::cuda::CublasGetErrorString(e); \ + << "cuBLAS: " << mxnet::common::cuda::CublasGetErrorString(e); \ } /*! 
@@ -213,7 +213,7 @@ inline DType __device__ CudaMin(DType a, DType b) { { \ cusolverStatus_t e = (func); \ CHECK_EQ(e, CUSOLVER_STATUS_SUCCESS) \ - << "cuSolver: " << common::cuda::CusolverGetErrorString(e); \ + << "cuSolver: " << mxnet::common::cuda::CusolverGetErrorString(e); \ } /*! @@ -226,7 +226,7 @@ inline DType __device__ CudaMin(DType a, DType b) { { \ curandStatus_t e = (func); \ CHECK_EQ(e, CURAND_STATUS_SUCCESS) \ - << "cuRAND: " << common::cuda::CurandGetErrorString(e); \ + << "cuRAND: " << mxnet::common::cuda::CurandGetErrorString(e); \ } #if !defined(_MSC_VER) @@ -304,11 +304,31 @@ inline bool SupportsTensorCore(int device_id) { * \return whether to allow TensorCore algo (if not specified by the Operator locally). */ inline bool GetEnvAllowTensorCore() { - // Use of optional here permits: "0", "1", "true" and "false" to all be legal. - bool default_value = MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT; - return dmlc::GetEnv("MXNET_CUDA_ALLOW_TENSOR_CORE", - dmlc::optional(default_value)).value(); + // Since these statics are in the '.h' file, they will exist and will be set + // separately in each compilation unit. Not ideal, but cleaner than creating a + // cuda_utils.cc solely to have a single instance and initialization. + static bool allow_tensor_core = false; + static bool is_set = false; + if (!is_set) { + // Use of optional here permits: "0", "1", "true" and "false" to all be legal. + bool default_value = MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT; + allow_tensor_core = dmlc::GetEnv("MXNET_CUDA_ALLOW_TENSOR_CORE", + dmlc::optional(default_value)).value(); + is_set = true; + } + return allow_tensor_core; +} + +#if CUDA_VERSION >= 9000 +// Sets the cuBLAS math mode that determines the 'allow TensorCore' policy. Returns previous. 
+inline cublasMath_t SetCublasMathMode(cublasHandle_t blas_handle, cublasMath_t new_math_type) { + auto handle_math_mode = CUBLAS_DEFAULT_MATH; + CUBLAS_CALL(cublasGetMathMode(blas_handle, &handle_math_mode)); + CUBLAS_CALL(cublasSetMathMode(blas_handle, new_math_type)); + return handle_math_mode; } +#endif + #endif // MXNET_USE_CUDA #if MXNET_USE_CUDNN diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h index e2fab9f1f7dd..cf13655d9c97 100644 --- a/src/operator/fully_connected-inl.h +++ b/src/operator/fully_connected-inl.h @@ -33,7 +33,7 @@ #include #include "./operator_common.h" #include "./elemwise_op_common.h" - +#include "linalg.h" namespace mxnet { namespace op { @@ -96,7 +96,9 @@ class FullyConnectedOp : public Operator { Tensor wmat = in_data[fullc::kWeight].get(s); Tensor out = out_data[fullc::kOut].get_with_shape( Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); - out = dot(data, wmat.T()); + // Legacy approach shown here for comparison: + // out = dot(data, wmat.T()); + linalg_gemm(data, wmat, out, false, true, s); if (!param_.no_bias) { Tensor bias = in_data[fullc::kBias].get(s); out += repmat(bias, data.size(0)); @@ -136,7 +138,9 @@ class FullyConnectedOp : public Operator { CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; // gradient of weight Tensor gwmat = in_grad[fullc::kWeight].get(s); - Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data)); + // Legacy approach shown here for comparison: + // Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data)); + linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]); // gradient of bias if (!param_.no_bias) { Tensor gbias = in_grad[fullc::kBias].get(s); @@ -145,7 +149,9 @@ class FullyConnectedOp : public Operator { // gradient of data Tensor gdata = in_grad[fullc::kData].get_with_shape( Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); - Assign(gdata, req[fullc::kData], dot(grad, wmat)); + // Legacy approach 
shown here for comparison: + // Assign(gdata, req[fullc::kData], dot(grad, wmat)); + linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]); } private: diff --git a/src/operator/linalg.h b/src/operator/linalg.h index 9284a5825d2c..76acf7b98f41 100644 --- a/src/operator/linalg.h +++ b/src/operator/linalg.h @@ -26,6 +26,8 @@ #define MXNET_OPERATOR_LINALG_H_ #include +#include + #include "./c_lapack_api.h" using namespace mshadow; @@ -62,6 +64,14 @@ void linalg_batch_gemm(const Tensor& A, const Tensor& C, DType alpha, DType beta, bool tA, bool tB, Stream *s = 0); +template +inline void linalg_gemm(const Tensor& A, + const Tensor& B, + const Tensor& C, + bool tA, bool tB, + Stream *s = 0, + mxnet::OpReqType req = mxnet::kWriteTo); + //////////////////////////////// TRSM //////////////////////////////////////////// // CPU/GPU-versions of BLAS3 function "trsm". Please refer to the BLAS3-documentation diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h index affa7941640b..1e3b0e66e641 100644 --- a/src/operator/linalg_impl.h +++ b/src/operator/linalg_impl.h @@ -25,8 +25,12 @@ #ifndef MXNET_OPERATOR_LINALG_IMPL_H_ #define MXNET_OPERATOR_LINALG_IMPL_H_ +#include + #include +#include "../common/cuda_utils.h" + // Convenience functions. inline void linalg_check_batch_size(int A, int B, int C) { CHECK_EQ(A, B) << "Inconsistent batch size between arguments to linear algebra operator"; @@ -108,6 +112,55 @@ void linalg_gemm(const Tensor& A, const Tensor for DType=mshadow::half::half_t. +template<> inline +void linalg_gemm(const Tensor& A, + const Tensor& B, + const Tensor& C, + mshadow::half::half_t alpha, + mshadow::half::half_t beta, + bool tA, bool tB, Stream *s) { + using namespace mxnet; + using mshadow::gpu; + CHECK_NOTNULL(s); + check_gemm(A, B, C, alpha, beta, tA, tB); + +#if CUDA_VERSION >= 7050 + auto blas_handle = Stream::GetBlasHandle(s); +#if CUDA_VERSION >= 9000 + auto cublas_math_mode = GetEnvAllowTensorCore() ? 
CUBLAS_TENSOR_OP_MATH + : CUBLAS_DEFAULT_MATH; + auto previous_math_mode = SetCublasMathMode(blas_handle, cublas_math_mode); +#endif + + // pseudo-fp16 (fp32 math with fp16 I/O) + float alpha_f = float(alpha); // NOLINT(*) + float beta_f = float(beta); // NOLINT(*) + + // As of cuda8, cublas adopted the cuda datatype, rather than maintaining its own datatype. +#if CUDA_VERSION >= 8000 + cudaDataType_t half_datatype = CUDA_R_16F; +#else + cublasDataType_t half_datatype = CUBLAS_DATA_HALF; +#endif + CUBLAS_CALL(cublasSgemmEx(blas_handle, + (tB ? CUBLAS_OP_T : CUBLAS_OP_N), + (tA ? CUBLAS_OP_T : CUBLAS_OP_N), + C.size(1), C.size(0), (tB ? B.size(1) : B.size(0)), + &alpha_f, + B.dptr_, half_datatype, B.stride_, + A.dptr_, half_datatype, A.stride_, + &beta_f, + C.dptr_, half_datatype, C.stride_)); +#if CUDA_VERSION >= 9000 + SetCublasMathMode(blas_handle, previous_math_mode); +#endif +#else + LOG(FATAL) << "FP16 gemm requires CUDA version >= 7.5!"; +#endif // CUDA_VERSION >= 7050 +} + + #define LINALG_GPU_BATCH_GEMM(fname, DType) \ template<> inline \ void linalg_batch_gemm(const Tensor& A, const Tensor& B, \ @@ -246,6 +299,39 @@ LINALG_GPU_BATCH_TRSM(DtrsmBatched, double) #endif +/*! + * \brief Performs gemm, setting alpha and beta as appropriate for `req`. + * + * \param A the first operand of the gemm + * \param B the second operand of the gemm + * \param C the data to be assigned + * \tparam tA whether the `A` operand should be transposed first. + * \tparam tB whether the `B` operand should be transposed first. 
+ * \tparam s the stream to perform the operation + * \param req the assignment request + */ +template +inline void linalg_gemm(const Tensor& A, + const Tensor& B, + const Tensor& C, + bool tA, bool tB, Stream *s, + mxnet::OpReqType req) { + using namespace mxnet; + switch (req) { + case kNullOp: + break; + case kWriteTo: + case kWriteInplace: + linalg_gemm(A, B, C, DType(1.0), DType(0.0), tA, tB, s); + break; + case kAddTo: + linalg_gemm(A, B, C, DType(1.0), DType(1.0), tA, tB, s); + break; + default: + LOG(FATAL) << "not reached"; + } +} + //////////////////////////////// TRMM //////////////////////////////////////////// // CPU/GPU-versions of BLAS3 function "trmm". Please refer to the BLAS3-documentation diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 866f6ad8abc0..81492fe6bbdb 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -926,6 +926,11 @@ def test_fullyconnected_with_type(): {'ctx': mx.cpu(0), 'inner_data': (2, 10), 'type_dict': {'inner_data': np.float64}}, {'ctx': mx.cpu(0), 'inner_data': (2, 10), 'type_dict': {'inner_data': np.float32}}] check_consistency(sym, ctx_list) + # Sizes are divisible by 8 to test TensorCore on Volta GPU. 
+ sym = mx.sym.FullyConnected(num_hidden=8, name='inner') + ctx_list = [{'ctx': mx.gpu(0), 'inner_data': (16, 24), 'type_dict': {'inner_data': np.float16}}, + {'ctx': mx.cpu(0), 'inner_data': (16, 24), 'type_dict': {'inner_data': np.float32}}] + check_consistency(sym, ctx_list) def test_activation_with_type(): From 6004e529320a3230607ff535d2ec16190130959d Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Thu, 17 Aug 2017 21:20:29 -0700 Subject: [PATCH 005/448] Modify pip install to specific tag (#7514) * Modify pip installation to specific tag * Fix --- docs/build_version_doc/AddVersion.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/build_version_doc/AddVersion.py b/docs/build_version_doc/AddVersion.py index 44959445edc0..e5569a0fdaac 100755 --- a/docs/build_version_doc/AddVersion.py +++ b/docs/build_version_doc/AddVersion.py @@ -86,14 +86,25 @@ outstr = outstr.replace('http://mxnet.io', 'https://mxnet.incubator.apache.org/' 'versions/%s' % (args.current_version)) - # Fix git clone to specific tag + # Fix git clone and pip installation to specific tag + pip_pattern = ['', '-cu80', '-cu75', '-cu80mkl', '-cu75mkl', '-mkl'] if args.current_version == 'master': outstr = outstr.replace('git clone --recursive https://github.com/dmlc/mxnet', 'git clone --recursive https://github.com/apache/incubator-mxnet.git') + for trail in pip_pattern: + outstr = outstr.replace('pip install mxnet%s<' % (trail), + 'pip install mxnet%s --pre<' % (trail)) + outstr = outstr.replace('pip install mxnet%s\n<' % (trail), + 'pip install mxnet%s --pre\n<' % (trail)) else: outstr = outstr.replace('git clone --recursive https://github.com/dmlc/mxnet', 'git clone --recursive https://github.com/apache/incubator-mxnet.git ' '--branch %s' % (args.current_version)) + for trail in pip_pattern: + outstr = outstr.replace('pip install mxnet%s<' % (trail), + 'pip install mxnet%s==%s<' % (trail, args.current_version)) + outstr = outstr.replace('pip install mxnet%s\n<' % 
(trail), + 'pip install mxnet%s==%s\n<' % (trail, args.current_version)) with open(os.path.join(path, name), "w") as outf: outf.write(outstr) From d2dbffe194f4eed728c883927ce639919cb2078f Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Fri, 18 Aug 2017 02:27:31 -0700 Subject: [PATCH 006/448] [scala-package][spark] fix example script (#7411) * temp * temp * fix example script * update indent --- .gitignore | 2 + scala-package/spark/bin/run-mnist-example.sh | 59 ++++++++++++-------- scala-package/spark/pom.xml | 2 +- 3 files changed, 40 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 82d2e560237d..4e4ff78d3489 100644 --- a/.gitignore +++ b/.gitignore @@ -146,3 +146,5 @@ bld target bin/im2rec + +model/ \ No newline at end of file diff --git a/scala-package/spark/bin/run-mnist-example.sh b/scala-package/spark/bin/run-mnist-example.sh index cae19386a8ee..57b8a1803363 100755 --- a/scala-package/spark/bin/run-mnist-example.sh +++ b/scala-package/spark/bin/run-mnist-example.sh @@ -18,47 +18,62 @@ # under the License. 
CURR_DIR=$(cd `dirname $0`; pwd) -MODULE_DIR=$(cd $CURR_DIR/../; pwd) -ROOT_DIR=$(cd $CURR_DIR/../../; pwd) +SPARK_MODULE_DIR=$(cd $CURR_DIR/../; pwd) +SCALA_PKG_DIR=$(cd $CURR_DIR/../../; pwd) +OS="" -LIB_DIR=${MODULE_DIR}/target/classes/lib -JAR=${MODULE_DIR}/target/mxnet-spark_2.10-0.1.2-SNAPSHOT.jar +if [ "$(uname)" == "Darwin" ]; then + # Do something under Mac OS X platform + OS='osx-x86_64-cpu' +elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then + OS='linux-x86_64-cpu' +fi -LIBS=${ROOT_DIR}/assembly/linux-x86_64-cpu/target/mxnet-full_2.10-linux-x86_64-cpu-0.1.2-SNAPSHOT.jar -LIBS="${LIBS},${LIB_DIR}/args4j-2.0.29.jar,${LIB_DIR}/scala-library-2.10.4.jar,${JAR}" +LIB_DIR=${SPARK_MODULE_DIR}/target/classes/lib +SPARK_JAR=`find ${SPARK_MODULE_DIR}/target -name "*.jar" -type f -exec ls "{}" + | grep -v -E '(javadoc|sources)'` +SCALA_JAR=`find ${SCALA_PKG_DIR}/assembly/$OS/target -maxdepth 1 -name "*.jar" -type f -exec ls "{}" + | grep -v -E '(javadoc|sources)'` -SPARK_OPTS+=" --name mxnet" +SPARK_OPTS+=" --name mxnet-spark-mnist" SPARK_OPTS+=" --driver-memory 1g" SPARK_OPTS+=" --executor-memory 1g" SPARK_OPTS+=" --num-executors 2" SPARK_OPTS+=" --executor-cores 1" -SPARK_OPTS+=" --jars ${LIBS}" +SPARK_OPTS+=" --jars ${SCALA_JAR}" -# You can download these two files as training & validation set. -# They were converted from the MNIST dataset, -# in which each sample was simply flatterned to an array of floats. -# https://s3-us-west-2.amazonaws.com/mxnet.liuyz/data/mnist/train.txt -# https://s3-us-west-2.amazonaws.com/mxnet.liuyz/data/mnist/val.txt +# Download training and test set +if [ ! -f ./train.txt ]; then + wget https://s3-us-west-2.amazonaws.com/mxnet.liuyz/data/mnist/train.txt +fi + +if [ ! 
-f ./val.txt ]; then + wget https://s3-us-west-2.amazonaws.com/mxnet.liuyz/data/mnist/val.txt +fi # running opts -RUN_OPTS+=" --input ${INPUT_TRAIN}" -RUN_OPTS+=" --input-val ${INPUT_VAL}" -RUN_OPTS+=" --output ${OUTPUT}" +RUN_OPTS+=" --input train.txt" +RUN_OPTS+=" --input-val val.txt" +RUN_OPTS+=" --output ./" # These jars are required by the KVStores at runtime. # They will be uploaded and distributed to each node automatically. -RUN_OPTS+=" --jars ${LIBS}" +RUN_OPTS+=" --jars $SCALA_JAR,$SPARK_JAR" RUN_OPTS+=" --num-server 1" RUN_OPTS+=" --num-worker 2" -RUN_OPTS+=" --java /usr/local/jdk1.8.0_60/bin/java" +RUN_OPTS+=" --java $JAVA_HOME/bin/java" RUN_OPTS+=" --model mlp" RUN_OPTS+=" --cpus 0,1" RUN_OPTS+=" --num-epoch 5" -${SPARK_HOME}/bin/spark-submit --master spark://localhost:7077 \ - --conf spark.dynamicAllocation.enabled=false \ - --conf spark.speculation=false \ +# check if SPARK_HOME is set +if [ -z "$SPARK_HOME" ]; then + echo "SPARK_HOME is unset"; + exit 1 +fi + +HOST=`hostname` + +$SPARK_HOME/bin/spark-submit --master spark://$HOST:7077 \ --class ml.dmlc.mxnet.spark.example.ClassificationExample \ ${SPARK_OPTS} \ - ${JAR} \ + ${SPARK_JAR} \ ${RUN_OPTS} diff --git a/scala-package/spark/pom.xml b/scala-package/spark/pom.xml index 18170b95579b..c59662f6debc 100644 --- a/scala-package/spark/pom.xml +++ b/scala-package/spark/pom.xml @@ -14,7 +14,7 @@ MXNet Scala Package - Spark ML - 1.6.1 + 1.6.3 From 406bc198538c904d6105ffafe4cc230d0c858545 Mon Sep 17 00:00:00 2001 From: buryang <419494197@163.com> Date: Sat, 19 Aug 2017 00:41:06 +0800 Subject: [PATCH 007/448] V0.11.0 (#7518) * Update NOTICE & README * New code signing key & README file changes (#7464) * add Naveen's Code Signing Key (#7460) * Updating CoreML readme file (#7459) * Fixing CoreML converter's README: typos/grammar/etc. * CoreML converter README update: Talk about layers first and then about models. 
* Providing examples on converting various standard models; calling out issues with InceptionV3. * Change RC version in NEWS (#7467) * add Naveen's Code Signing Key (#7460) * Updating CoreML readme file (#7459) * Fixing CoreML converter's README: typos/grammar/etc. * CoreML converter README update: Talk about layers first and then about models. * Providing examples on converting various standard models; calling out issues with InceptionV3. * Update NEWS * Update NEWS * update to rc2 --- NEWS.md | 2 +- NOTICE | 7 +++++-- README.md | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4fdd31430002..4f1ecd15689c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ MXNet Change Log ================ -## 0.11.0-rc0 +## 0.11.0-rc2 ### - Major Features - Apple Core ML model converter - Support for Keras v1.2.2 diff --git a/NOTICE b/NOTICE index 2051e3c00d53..03695607e3e9 100644 --- a/NOTICE +++ b/NOTICE @@ -1,2 +1,5 @@ -MXNet -Copyright (c) 2015-2016 by Contributors +Apache MXNET (incubating) +Copyright [2015-2017] The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). diff --git a/README.md b/README.md index 841c6f1f62c2..a11780aa019b 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ deep learning systems, and interesting insights of DL systems for hackers. What's New ---------- -* [Version 0.11.0-rc0 Release](https://github.com/apache/incubator-mxnet/releases/tag/0.11.0.rc0) - MXNet 0.11.0-rc0 Release. +* [Version 0.11.0-rc2 Release](https://github.com/apache/incubator-mxnet/releases/tag/0.11.0.rc2) - MXNet 0.11.0-rc2 Release. * [Apache Incubator](http://incubator.apache.org/projects/mxnet.html) - We are now an Apache Incubator project. * [Version 0.10.0 Release](https://github.com/dmlc/mxnet/releases/tag/v0.10.0) - MXNet 0.10.0 Release. * [Version 0.9.3 Release](./docs/architecture/release_note_0_9.md) - First 0.9 official release. 
From 5ee1cfe96852eb695f933dfe083e0429d97b704e Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Fri, 18 Aug 2017 20:35:52 -0700 Subject: [PATCH 008/448] Update README.md for easy copy and paste --- scala-package/spark/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scala-package/spark/README.md b/scala-package/spark/README.md index 974691650ff4..06106648c059 100644 --- a/scala-package/spark/README.md +++ b/scala-package/spark/README.md @@ -11,7 +11,7 @@ The MXNet on Spark is still in *experimental stage*. Any suggestion or contribut Build ------------ -Checkout the [Installation Guide](http://mxnet.io/get_started/setup.html) contains instructions to install mxnet. Remember to enable the distributed training, i.e., set `USE_DIST_KVSTORE = 1`. +Checkout the [Installation Guide](http://mxnet.io/get_started/setup.html) contains instructions to install mxnet. Remember to enable the distributed training, i.e., set `USE_DIST_KVSTORE=1`. Compile the Scala Package by From 0efc326e2243625d622a43287bf15c62e6afd1b0 Mon Sep 17 00:00:00 2001 From: Ziyue Huang Date: Mon, 21 Aug 2017 05:43:02 +0800 Subject: [PATCH 009/448] Fix a bug in SequentialRNNCell.reset() (#7449) * remove self-implemented speedometer * fix bug in SequentialRNNCell.reset * Revert "remove self-implemented speedometer" This reverts commit 17aa4c0887c099f22c4769de079ef0130ed5f3e8. 
* fix lint * fix * fix reset in origin rnn and gluon rnn * fix origin rnn --- python/mxnet/gluon/rnn/rnn_cell.py | 2 ++ python/mxnet/rnn/rnn_cell.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/python/mxnet/gluon/rnn/rnn_cell.py b/python/mxnet/gluon/rnn/rnn_cell.py index c9186fd3ce09..eb67fd73db7d 100644 --- a/python/mxnet/gluon/rnn/rnn_cell.py +++ b/python/mxnet/gluon/rnn/rnn_cell.py @@ -121,6 +121,8 @@ def reset(self): """Reset before re-using the cell for another graph.""" self._init_counter = -1 self._counter = -1 + for cell in self._children: + cell.reset() def state_info(self, batch_size=0): """shape and layout information of states""" diff --git a/python/mxnet/rnn/rnn_cell.py b/python/mxnet/rnn/rnn_cell.py index 1c3452041494..b2bf107c38ca 100644 --- a/python/mxnet/rnn/rnn_cell.py +++ b/python/mxnet/rnn/rnn_cell.py @@ -134,6 +134,9 @@ def reset(self): """Reset before re-using the cell for another graph.""" self._init_counter = -1 self._counter = -1 + if hasattr(self, '_cells'): + for cell in self._cells: + cell.reset() def __call__(self, inputs, states): """Unroll the RNN for one time step. From 686153e59a6c87f33f8f93297c8326516e74c964 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 22 Aug 2017 02:00:54 +0800 Subject: [PATCH 010/448] Fix argsort + Update MShadow (#7535) * fix the topk related operators to use int for indexing and use CUB for sorting. 
* Update MShadow * fix bug * fix lint --- mshadow | 2 +- src/operator/tensor/init_op.h | 7 +-- src/operator/tensor/ordering_op-inl.h | 76 ++++++++++++++++----------- 3 files changed, 50 insertions(+), 35 deletions(-) diff --git a/mshadow b/mshadow index 497eb9180b24..6d75df228978 160000 --- a/mshadow +++ b/mshadow @@ -1 +1 @@ -Subproject commit 497eb9180b24592b7332e7e08f2c053ec5346524 +Subproject commit 6d75df228978ca5f182dd707578ef704099ab5ee diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index bdc74d332491..30a5a3a3af1b 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -186,9 +186,10 @@ inline bool RangeShape(const nnvm::NodeAttrs& attrs, << "(" << param.start << "," << param.stop.value() << "," << param.step << ")"; } SHAPE_ASSIGN_CHECK(*out_attrs, 0, - mshadow::Shape1(param.repeat * - ceil((param.stop.value() - - param.start) / param.step))); + mshadow::Shape1(mshadow::expr::RangeOutSize(param.start, + param.stop.value(), + param.step, + param.repeat))); return true; } diff --git a/src/operator/tensor/ordering_op-inl.h b/src/operator/tensor/ordering_op-inl.h index eb28b010cbd3..560554151a19 100644 --- a/src/operator/tensor/ordering_op-inl.h +++ b/src/operator/tensor/ordering_op-inl.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include "../mshadow_op.h" @@ -175,8 +176,10 @@ void TopKImpl(RunContext ctx, } // 1. 
Parse and initialize information Stream *s = ctx.get_stream(); - Tensor workspace; - Tensor sorted_dat, indices, batch_id, sel_indices; + Tensor workspace; + Tensor temp_workspace; + Tensor sorted_dat; + Tensor indices, batch_id, sel_indices; Tensor mask_val; int batch_size, element_num; // number of batches + the size of each batch int axis = 0; @@ -187,49 +190,58 @@ void TopKImpl(RunContext ctx, ParseTopKParam(src.shape_, param, &target_shape, &batch_size, &element_num, &axis, &k, &do_transpose, &is_ascend); Tensor dat = src.FlatTo3D(axis, axis, s); + size_t temp_size = mxnet::op::SortByKeyWorkspaceSize(src.Size()); + temp_size = std::max(temp_size, mxnet::op::SortByKeyWorkspaceSize(src.Size())); + temp_size = std::max(temp_size, mxnet::op::SortByKeyWorkspaceSize(src.Size())); + size_t workspace_size = temp_size + sizeof(real_t) * src.Size() + sizeof(int) * src.Size() * 2; if (param.ret_typ == topk_enum::kReturnMask) { - workspace = - resource.get_space_typed(Shape1(src.Size() * 3 + 2 * batch_size * k), s); - } else { - workspace = resource.get_space_typed(mshadow::Shape1(src.Size() * 3), s); + workspace_size += sizeof(int) * batch_size * k + sizeof(real_t) * batch_size * k; } - sorted_dat = Tensor(workspace.dptr_, + workspace = resource.get_space_typed(Shape1(workspace_size), s); + char* workspace_curr_ptr = workspace.dptr_; + sorted_dat = Tensor(reinterpret_cast(workspace_curr_ptr), Shape1(src.Size()), s); // contain sorted dat - indices = Tensor(workspace.dptr_ + src.Size(), - Shape1(src.Size()), s); // indices in the original matrix - batch_id = Tensor(workspace.dptr_ + 2 * src.Size(), - Shape1(src.Size()), s); // batch id in the original matrix + workspace_curr_ptr += sizeof(real_t) * src.Size(); + indices = Tensor(reinterpret_cast(workspace_curr_ptr), + Shape1(src.Size()), s); // indices in the original matrix + workspace_curr_ptr += sizeof(int) * src.Size(); + batch_id = Tensor(reinterpret_cast(workspace_curr_ptr), + Shape1(src.Size()), s); // batch id in 
the original matrix + workspace_curr_ptr += sizeof(int) * src.Size(); if (do_transpose) { sorted_dat = reshape(transpose(dat, Shape3(0, 2, 1)), Shape1(src.Size())); } else { sorted_dat = reshape(dat, Shape1(src.Size())); } - indices = range(0, batch_size * element_num); + indices = range(0, batch_size * element_num); CHECK_EQ(sorted_dat.CheckContiguous(), true); CHECK_EQ(indices.CheckContiguous(), true); if (param.ret_typ == topk_enum::kReturnMask) { - sel_indices = Tensor(workspace.dptr_ + 3 * src.Size(), - Shape1(batch_size * k), s); - mask_val = Tensor(workspace.dptr_ + 3 * src.Size() + batch_size * k, + sel_indices = Tensor(reinterpret_cast(workspace_curr_ptr), + Shape1(batch_size * k), s); + workspace_curr_ptr += sizeof(int) * batch_size * k; + mask_val = Tensor(reinterpret_cast(workspace_curr_ptr), Shape2(batch_size * k, 1), s); + workspace_curr_ptr += sizeof(real_t) * batch_size * k; mask_val = scalar(1); CHECK_EQ(sel_indices.CheckContiguous(), true); CHECK_EQ(mask_val.CheckContiguous(), true); } - + temp_workspace = Tensor(workspace_curr_ptr, Shape1(temp_size), s); // temp space + workspace_curr_ptr += temp_size; // 2. Perform inplace batch sort using the `SortByKey` in MShadow // After sorting, each batch in `sorted_dat` will be sorted in the corresponding order // and the `indices` will contain the corresponding index in `sorted_dat` // Sort the data and keep record of the correspondence to global indices. 
- mxnet::op::SortByKey(sorted_dat, indices, is_ascend); + mxnet::op::SortByKey(sorted_dat, indices, is_ascend, &temp_workspace); // Calculate the corresponding batch indices of the elements - batch_id = F(indices / static_cast(element_num)); + batch_id = indices / element_num; // Since the SortByKey performs stable sort, the second SortByKey will reorder // the sorted_dat based on the order of the batch_id - mxnet::op::SortByKey(batch_id, sorted_dat, true); + mxnet::op::SortByKey(batch_id, sorted_dat, true, &temp_workspace); // Reorder the indices - batch_id = F(indices / static_cast(element_num)); - mxnet::op::SortByKey(batch_id, indices, true); + batch_id = indices / element_num; + mxnet::op::SortByKey(batch_id, indices, true, &temp_workspace); // 3. Assign results to the ret blob if (param.ret_typ == topk_enum::kReturnMask) { @@ -239,8 +251,8 @@ void TopKImpl(RunContext ctx, sel_indices = reshape(slice<1>( inplace_reshape(indices, Shape2(batch_size, - element_num)), 0, k), - Shape1(batch_size * k)); + element_num)), 0, k), + Shape1(batch_size * k)); if (do_transpose) { TShape src_shape = src.shape_.FlatTo3D(axis); CHECK_EQ(sel_indices.CheckContiguous(), true); @@ -249,23 +261,24 @@ void TopKImpl(RunContext ctx, } IndexFill(ret_mask, sel_indices, mask_val); } else if (param.ret_typ == topk_enum::kReturnIndices) { - indices -= batch_id * static_cast(element_num); + indices -= batch_id * element_num; if (do_transpose) { Tensor ret_indices = ret[0].FlatTo3D(axis, axis, s); - ret_indices = transpose( + ret_indices = tcast(transpose( slice<2>(inplace_reshape(indices, Shape3(ret_indices.shape_[0], ret_indices.shape_[2], element_num)), 0, k), - Shape3(0, 2, 1)); + Shape3(0, 2, 1))); } else { Tensor ret_indices = ret[0].get_with_shape(Shape2(batch_size, k), s); - ret_indices = slice<1>(inplace_reshape(indices, Shape2(batch_size, element_num)), 0, k); + ret_indices = tcast(slice<1>( + inplace_reshape(indices, Shape2(batch_size, element_num)), 0, k)); } } else { - indices 
-= batch_id * static_cast(element_num); + indices -= batch_id * element_num; if (do_transpose) { Tensor ret_value = ret[0].FlatTo3D(axis, axis, s); Tensor ret_indices = ret[1].FlatTo3D(axis, axis, s); @@ -274,20 +287,21 @@ void TopKImpl(RunContext ctx, Shape3(ret_value.shape_[0], ret_value.shape_[2], element_num)), 0, k), Shape3(0, 2, 1)); - ret_indices = transpose( + ret_indices = tcast(transpose( slice<2>(inplace_reshape(indices, Shape3(ret_indices.shape_[0], ret_indices.shape_[2], element_num)), 0, k), - Shape3(0, 2, 1)); + Shape3(0, 2, 1))); } else { Tensor ret_value = ret[0].get_with_shape(Shape2(batch_size, k), s); Tensor ret_indices = ret[1].get_with_shape(Shape2(batch_size, k), s); ret_value = slice<1>(inplace_reshape(sorted_dat, Shape2(batch_size, element_num)), 0, k); - ret_indices = slice<1>(inplace_reshape(indices, Shape2(batch_size, element_num)), 0, k); + ret_indices = tcast(slice<1>( + inplace_reshape(indices, Shape2(batch_size, element_num)), 0, k)); } } } From 32fc60b29d00201abe6d55ce0b751b25398b1d46 Mon Sep 17 00:00:00 2001 From: Eric Junyuan Xie Date: Mon, 21 Aug 2017 11:44:52 -0700 Subject: [PATCH 011/448] Update Jenkinsfile (#7541) --- Jenkinsfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 95115cf58920..bf237a589c99 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -154,7 +154,8 @@ try { node('mxnetlinux') { ws('workspace/amalgamation') { init_git() - make('cpu', '-C amalgamation/ USE_BLAS=openblas MIN=1') + make('cpu', '-C amalgamation/ clean') + make('cpu', '-C amalgamation/ USE_BLAS=openblas') } } }, From ab1486704a7e0fdf1c7b0619306f4899a6f1e8de Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Mon, 21 Aug 2017 16:15:55 -0700 Subject: [PATCH 012/448] Move usage of persistent BN to cuDNN 7.0.3 (#7543) --- src/operator/cudnn_batch_norm-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/cudnn_batch_norm-inl.h b/src/operator/cudnn_batch_norm-inl.h 
index b0c5f43157d0..258bed5ea326 100644 --- a/src/operator/cudnn_batch_norm-inl.h +++ b/src/operator/cudnn_batch_norm-inl.h @@ -112,7 +112,7 @@ class CuDNNBatchNormOp : public Operator { Tensor y = out_data[cudnnbatchnorm::kOut].get_with_shape(shape_, s); -#if CUDNN_VERSION >= 7000 +#if CUDNN_VERSION >= 7003 auto mode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; #else auto mode = CUDNN_BATCHNORM_SPATIAL; @@ -201,7 +201,7 @@ class CuDNNBatchNormOp : public Operator { out_grad[cudnnbatchnorm::kOut].get_with_shape(shape_, s); #if CUDNN_VERSION >= 4007 -#if CUDNN_VERSION >= 7000 +#if CUDNN_VERSION >= 7003 auto mode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; #else auto mode = CUDNN_BATCHNORM_SPATIAL; From 9796134077891de3d13e6774adb4db81a0a2ecb6 Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Mon, 21 Aug 2017 16:17:26 -0700 Subject: [PATCH 013/448] 0.11.1 (#7542) --- R-package/DESCRIPTION | 2 +- include/mxnet/base.h | 2 +- python/mxnet/libinfo.py | 2 +- scala-package/pom.xml | 2 +- snapcraft.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index e0b435513718..9d2951c0090c 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -1,7 +1,7 @@ Package: mxnet Type: Package Title: MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems -Version: 0.11.0 +Version: 0.11.1 Date: 2017-06-27 Author: Tianqi Chen, Qiang Kou, Tong He Maintainer: Qiang Kou diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 695408380ec9..61d105a5bc48 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -112,7 +112,7 @@ /*! \brief minor version */ #define MXNET_MINOR 11 /*! \brief patch version */ -#define MXNET_PATCH 0 +#define MXNET_PATCH 1 /*! \brief mxnet version */ #define MXNET_VERSION (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH) /*! 
\brief helper for making version number */ diff --git a/python/mxnet/libinfo.py b/python/mxnet/libinfo.py index 7da0dcfc8d2d..f0838d7028c4 100644 --- a/python/mxnet/libinfo.py +++ b/python/mxnet/libinfo.py @@ -61,4 +61,4 @@ def find_lib_path(): # current version -__version__ = "0.11.0" +__version__ = "0.11.1" diff --git a/scala-package/pom.xml b/scala-package/pom.xml index 7bfd8774de6b..d6be0996bad3 100644 --- a/scala-package/pom.xml +++ b/scala-package/pom.xml @@ -48,7 +48,7 @@ - 0.11.0-SNAPSHOT + 0.11.1-SNAPSHOT 2.11.8 2.11 diff --git a/snapcraft.yaml b/snapcraft.yaml index 27356c332a29..6d45746aff74 100644 --- a/snapcraft.yaml +++ b/snapcraft.yaml @@ -1,5 +1,5 @@ name: mxnet -version: '0.11.0' +version: '0.11.1' summary: MXNet is a deep learning framework designed for efficiency and flexibility. description: | MXNet is a deep learning framework designed for both efficiency and From e0607bcfeeb872eca4b66aea192e904ddd3ce061 Mon Sep 17 00:00:00 2001 From: mbaijal <30911248+mbaijal@users.noreply.github.com> Date: Mon, 21 Aug 2017 18:38:20 -0700 Subject: [PATCH 014/448] remove MXNet License from rcnn license (#7549) --- example/rcnn/LICENSE | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/example/rcnn/LICENSE b/example/rcnn/LICENSE index 749e0a16b363..ac015288d18c 100644 --- a/example/rcnn/LICENSE +++ b/example/rcnn/LICENSE @@ -15,23 +15,6 @@ See the License for the specific language governing permissions and limitations under the License. -MXNet - -Copyright (c) 2015-2016 by Contributors - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. - - Fast R-CNN Copyright (c) Microsoft Corporation From f517d9d406525c4e109dd6fe10d9d175ed75d0db Mon Sep 17 00:00:00 2001 From: Chris Olivier Date: Tue, 22 Aug 2017 09:27:24 -0700 Subject: [PATCH 015/448] Fix optimizer parms in fit.py + Don't repeatedly call slow prepare_mkl.sh script (#7545) (#7547) * Only call MKL script once * Fix 'momentum' and 'multi_precision' optimizer args * fix working --- Makefile | 4 ++-- example/image-classification/common/fit.py | 13 ++++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index b6c5834a566b..300f901bd662 100644 --- a/Makefile +++ b/Makefile @@ -36,8 +36,8 @@ include $(config) ifeq ($(USE_MKL2017), 1) # must run ./prepare_mkl before including mshadow.mk - RETURN_STRING = $(shell ./prepare_mkl.sh $(MKLML_ROOT)) - MKLROOT = $(firstword $(RETURN_STRING)) + RETURN_STRING := $(shell ./prepare_mkl.sh $(MKLML_ROOT)) + MKLROOT := $(firstword $(RETURN_STRING)) export USE_MKLML = $(lastword $(RETURN_STRING)) endif diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index 73235fc2e4ef..dfec2a886b80 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -163,10 +163,17 @@ def fit(args, network, data_loader, **kwargs): lr_scheduler = lr_scheduler optimizer_params = { 'learning_rate': lr, - 'momentum' : args.mom, 'wd' : args.wd, - 'lr_scheduler': lr_scheduler, - 'multi_precision': True} + 'lr_scheduler': lr_scheduler} + + # Add 'multi_precision' parameter only for SGD optimizer + if args.optimizer == 'sgd': + optimizer_params['multi_precision'] = True + + # Only a limited number of optimizers have 'momentum' property + has_momentum = {'sgd', 'dcasgd'} + if args.optimizer in has_momentum: + optimizer_params['momentum'] = args.mom monitor = mx.mon.Monitor(args.monitor, pattern=".*") if 
args.monitor > 0 else None From 0b1363116c84dcefa751a925749b2da04c3f2614 Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Tue, 22 Aug 2017 14:56:33 -0700 Subject: [PATCH 016/448] Sparse Tensor: request for reviews (#7082) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [WIP] Sparse Tensor (#5800) * squash merge with 38f7c5584016e92ba1e0ee1b00ea6632740f67ce compiles on GPU update check alloc: Checkpoint. Pass elem-sum gpu test bug fix for copyfromto. sparse sgd test pass on gpu inefficient implementation for csr copy update submodule fix lint Simple bind with infer storage type (#32) * Symbol binding for sparse tensor development. (#31) * Initial checkin * Add init functions for simple bind in graph_executor * Add simple_bind c_api * Add simple bind c-api * Assign zeros to in_args, arg_grads, and aux_states * Add simple_bind2 python interface * Fix python interface bugs * Interface changes * Fix * Fix core dump * Add bind_ith_exec c_api * Change simple_bind2 * Fix seg fault * Finish simple_bind * Change _bind_ith_exec * Refactor simple_bind initialization flow for bind * Consolidate bind and simple_bind graph init flow * Fix bug * Clean up * Add comments * Clean up * Clean up * Minor correction * Rename APIs in graph executor * Refactor * Rebase * Delete deprecated functions * Move more front-end work to backend * Bug fix * Fix failed tests * Minor fix * Fix lint * Fix lint * Revert unnecessary changes * Revert * Revert * Clean up * Fix lint Conflicts: python/mxnet/symbol.py src/executor/graph_executor.cc * Add inferstorage to graph executor * re-enable tests for sparse embedding with simple_bind * type switch fix in sparse embedding" ; change `default` to `default_storage` for cast storage op (#33) * change default to default_storage * disable cpp test build temporarily attempt to fix windows build error, and fix lint (#34) update nnvm submodule (#37) Scipy build (#38) * update nnvm submodule * add scipy pip install for 
dockerfile Python3 unit tests (#39) * change xrange to range for python3 compatiblity" * remove more xrange from tests replace long with int for python3 (#40) fix the rest of TShape constructor errors (#41) fix lint (#42) fix wrong usage of mshadow::Shape1" (#43) implementation for Csr slice on cpu (#36) * CPU implementation for CSR remove seg_len from csr slice add some docs for slice csr change indptr, values, etc to be private member bug fix in sparse embedding update nnvm submoduel fix lint update unit test for sparse nd" * add const for SliceCsrIndPtr kernel Fix sparse dot according to the new RSP definition (#35) * Fix csr dot dns * Fix sparse dot * Add fallback and test cases for dot(csr, dns)=dns * Add int type switch * Fix * Fix * Fix update mshadow submodule (#44) Fix dns to rsp (#46) fix lint (#47) add runtime storage fallback detection" (#48) * add runtime storage fallback detection" * replace cast storage ex with cast storage impl Fm example (#45) * update csr slice logic to avoid confusion. add more exmaples. * add hint to module.update * more testcases(fallback) for sparse_nd * add to_csr() and to_rsp() method. More unit test (fallback now) * add fm test. fix lint * register sparse sgd under Optim.SGD * update dmlc-core submoduel * change indptr to _indptr temporarily. add const ref to fname fix lint fix lint; (#51) Guard gpu cast storage (#50) * Clean up * Fix typo Rearrange unit test files (#52) fix lint. add scipy for python_test. fix scipy.sparse import error. 
fix truediv for python3 fix travis test (#54) * remove pyc files * add verbose for travis nosetests cleanup some testing code and enums (#57) * update Makefile * refactor test_sparse_operator * change `default_storage` back to `default` * remove unused cpp tests port libsvm parser to mxnet as libsvm iter (#55) * copied csv iter to libsvm iter test libsvm iter draft handle round batch == false for csr batch loader code refactoring add get stype, shape interface to iiter separate class for sparse iter add missing file fix mem corruption' rename variables add comments also read label from libsvm add test. update docs. update submodule Conflicts: python/mxnet/sparse_ndarray.py * update submodule * fix lint * update test * revert naming change add benchmark scritp for dot (#59) * add benchmark scritp for dot add gpu option for bench add get_data funciton for benchmark print t_sparse, too; add comment change nnz to dnesity add backward * add comment update fm test (#62) introduce CSRNDarray and rowsparseNDarray to python frontend api (#58) * introduce CSRNDarray and rowsparseNDarray to python frontend api * temporarily disable fm_module test fix lint (#64) fix typo. 
disable libsvm io test (#65) Improve dot (#61) * Init checkin * Fix * Adjust dot parallelization methods * Set num_omp_threads for benchmark from command line * Fix omp thread number * Clean up * Add scipy as dot baseline * Fix format sparse_retain op (#66) * Initial checkin * Fix bugs * Add unit test for sparse_retain * Add example and modify test add storage cast for outputs that have non-default storage (#67) fix gpu build (#69) Fix test_sparse_retain python3 issue (#68) revert nnvm version * draft for sgd rsp rsp (#75) support sgd(rsp, rsp) support dot(csr, rsp) when rsp is full add ref to const ndarray params support sparse embedding with rsp weight' fix lint modify embedding backward to produce dense grad remove invalid_rid for rsp->dns remove previous embedding op changes pass sparse embedding test add STORAGE_TYPE_ASSIGN_CHECK remove backward storage infer * fix lint (#78) * fix lint (#79) * serial elemwise sum impl (#80) update module kvstore interface add other missing params and functions revert some interface changes revert some more changes reomve explicit casting for gradients on kvstore update Comm interface update fm example Conflicts: python/mxnet/model.py python/mxnet/ndarray.py * bug fix for initializing module with row_sparse weight (#81) * bug fix for initializing module with row_sparse weight * update log message * Sparse ndarray serialization and deserialization (#77) * Initial checkin * Add unit tests * Fix lint * Fix lint (#84) * Sgd with row_sparse weight, dns gradient (#83) * sgd rsp dns draft * support sgd_mom(rsp, dns, rsp) * update doc * remove cast storage for kv updater * code refactoring * update mshadow version (#88) * csr slice bug fix (#90) * benchmark dot code refactor (#87) * q^x6x add some code in benchmark * refactor * minor fixes * fix * lint fix * Add unit test (#91) * add unittest * minor fix * remove commented lines * change test func name * add test rsp * kvstore push row sparse (#93) * Add multi-thread cpu elemwise sum 
for rsps * Minor fix * Add flag to switch between serial and multi-thread kvstore push * Fix lint in sparse_ndarray.py * Revert "Fix lint in sparse_ndarray.py" This reverts commit d7225ec267a1e8c0c3c8074d25af5844ed39a14d. * Fix ndarray init in copy(ctx) * Add env var to control the flow of serial/parallel reduce * Refactor * Fix copy ndarray bug * Fix lint * Refactor * Fix windows openmp build failure (#94) * update mshadow submoduel (#95) * Revert "update mshadow submoduel (#95)" (#96) This reverts commit 1a129e4cc39514a6c7b3aa1189949969b818aec3. * Refactor sparse tensor code (#99) * Initial checkin test_sparse_ndarray passes * Fix test failure * Clean up * Clean up * Move init backend op to ndarray_utils * Fix lint * Eliminate circular dependency on headers * More refactor * Fix gpu build and consolidate Slice for dense and sparse * Clean up * More refactor * Clean up * Fix gpu build * Fix comment * fix pylint (#100) * Fix refactor sparse gpu test (#104) * Fix gpu build * Fix * Fix gpu test failure * change idx types from int32 to int64 (#101) Conflicts: python/mxnet/test_utils.py tests/python/unittest/test_sparse_operator.py update mshadow submodule fix extra quotes in test script change indptr type to int64 better err message for rsp" * revert LOG(DEBUG) change (#105) * fix undefined zeros in optimizer.py (#106) * move init dns zeros to init_op.h for kvstore to use (#107) * Refactor cast storage (#109) * Refactor cast_storage * Add cast_storage cc and cu files * Remove redundant comments * Replace std::accumulate with ParallelAccumulate * Clean up * Fix windows build * Rowsparse kv (#111) * update kvstore unit test Conflicts: tests/python/unittest/test_kvstore.py update model/module.py Conflicts: python/mxnet/model.py python/mxnet/module/module.py fix lint resolve conflict remove int keys in kvstore update cast to str function * fix failed dist_sync_kv test * bug fix in comm to ensure merged gradient is of the right type bug fix in comm * row sparse dist 
kvstore draft (push only) row_sparse pull * add ndarray row sparse shared mem constructor * code refactoring * add test for row_sparse weight bug fix for kv server slicing add async support rsolve race condition in kvstore * resolve error after reb ase * fix lint (#113) * rename some python funciton (#114) * _to_rsp * _to_csr. raise NotImplementedError * todense * fix lint (#115) enable libsvm uniit test (#6839) remove shared mem slice for csr add csr ndarray iter test make osx nose test verbose disable libsvm iter test Move InferAttr to mxnet from nnvm (#6830) * Move InferAttr to mxnet from nnvm Replace nnvm infer attr functions in c_api Initial checkin Clean up Remove nnvm namespace for FInferShape, FInferType, and FInferStorageType Add new interface for InferStorageType Revert "Remove nnvm namespace for FInferShape, FInferType, and FInferStorageType" This reverts commit 8aedf054bfe29b076c6fcb6f54d996fd2752e4de. Fix and clean up Fix lint Add nnvm changes Change infer function interface to accept only rvalue reference of graph Clean up Flush commits to show up in PR Add error handling for storage type inference failure Update nnvm * Fix pylint Change idx type switch for aux data (#6860) * Change idx type switch for aux data * Add mshadow commit Sparse dot enhancement (#6842) * Initial checkin Initial checkin Fix sparse dot test Fix unitest and add fallback for sparse dot * Add benchmark code * Revert "Add benchmark code" This reverts commit be009fe4c5a2a321aa92e99ac6e9cc511198c742. * Fix bug * Fix storage shape * Remove unnecessary test code * Use idx type switch Implement dot(csr, rsp)=dns and dot(csr.T, rsp)=rsp and refactor (#6902) * Initial checkin Add dot(csr.T, rsp)=rsp2 Add infer storage for dot(csr, rsp)=dns and dot(csr.T, rsp)=rsp2 * Fix comments * Replace std::lower_bound with own impl for gpu use too * Add time profiling * Revert "Add time profiling" This reverts commit 8f5bb982867731df0305148b1b150b05661f8529. 
* Move dot and batch_dot to a single file * Move dot gpu impl to a .cuh file * More refactor * Fix include error LibsvmIter fix (#6898) * fix bug in libsvm iter which causes mem corruption * add test for news dataset * fix wrong path in test * fix import error for urllib * update url * replace bz command with bz module Optimized gpu dot kernels (#6937) * pulled update to mshadow * mshadow update * added optimized gpu kernels for dot(csr,dns)=dns and dot(csr.T,dns)=dns, and unit test * added __syncwarp to vector kernel and reduced number of writes to shared memory Refactor sparse tensor code (#6955) * Save stype in frontend to avoid c-api call for stype * Change storage_type to stype * Revert "Change storage_type to stype" This reverts commit 90db7d18b624f3ee4ffd37bf5680205e77ca2763. * Revert "Revert "Change storage_type to stype"" This reverts commit 09328382e926b92a42ba5b3df6f169f825975d88. Move ndarray.py, sparse_ndarray.py, ndarray_utils.py, and _ndarray_internal to ndarrary folder More refactor Move elementwise sum for rsp to ndarray_function.cc Remove unnecessary import in ndarray module Fix pylint Remove redundant code Remove _stype from slots Fix cpp-package build error caused by the change to imperative invoke interface Use relative import Remove print line Rename _ndarray_internal.py to _internal.py * Relaunch test... minor bug fix in warp synchronous code (#7029) * move storage type vector from nnvm to mxnet (#7054) * move storage type vector from nnvm to mxnet * update nnvm * update nnvm * Improve copy sparse tensors (#7003) * Use cast_storage when copying ndarrays of different stypes on same context * Relaunch test * fix failed tests. add back 64bit support for dot fix lint * bug fix for IdentityComputeRsp * fix lint fix lint fix lint * add data partition for libsvm iter (#7027) * remove sparse embedding (#7165) * fix ndarray namespace * remove untested gpu operators (#7172) * skip sparse dot gpu tset. 
add sparse_nd_zeros gpu test * remove sparse_retain gpu Conflicts: tests/python/gpu/test_operator_gpu.py * Fix ndarray aux data issue (#7098) * Fix getting sparse ndarray data/aux_data issues * Add tests for func csr and row_sparse * Make get/set data/aux_data thread safe * Fix a bug * Fix typo and comment * More comments * Correct comment Conflicts: tests/python/gpu/test_operator_gpu.py * Support K-dimensional row-sparse tensor (#7179) * remove check for k dimensional rowsparse tensor * change var name for rsp sgd operator * add checks for sparse dot * bug fix for kdim rowsparse cast storage cpu * update IdentityLikeRhsComputeEx interface * remove set_storage_shape from ndarray. support elemwise_add with kdim row_sparse tensor * use get_with_shape instead of reshape * update according to comments Conflicts: src/operator/tensor/elemwise_unary_op.h * Improve sparse ndarray error message (#7181) * add test for broadcast_to * add comments Conflicts: python/mxnet/base.py * construct row_sparse ndarray for dist-async fix bug in rsp add rsp sync push race condition for push fix bug in rsp pull. refactor test cleanup comments refactor dist server fix lint fix storage shape issue with the new ndarray constructor data sharding draft; fix lint. add comment add support for zeros gradients use std::upper_bound/lower_bound remove special init function for rowsparse dist kvstore temporary support for inplace operators for sparse add test. 
fix return type store kRowSparseNDArray in kv server remove fcomp_ex sgd with dns weight and rsp gradient bug fix in sparse retain sparse pull c_api revise rowsparse pull api use engine to compute unique to ensure thread safety add rowsparse pull to dist-kv fix lint add example for rsp_pull remove name2idx; add sparse_pull_dict param to module fix unit test and c rowid conversion support str key type in kvstore (#6765) * update kvstore unit test * update model/module.py * fix lint * remove int keys in kvstore * update cast to str function * remove _cast_to_str_keys * fix lint * always cast to str Conflicts: include/mxnet/c_api.h include/mxnet/kvstore.h python/mxnet/kvstore.py python/mxnet/model.py python/mxnet/module/module.py src/c_api/c_api.cc src/kvstore/kvstore_local.h tests/python/unittest/test_kvstore.py update module API for other submodules update stypes in kvstore after refactoring change type of size from size_t to int64_t add sparse linear regression example remove sparse_pull_dict from module fix init_optim for seq_module. update sparse example resolve conflict for binary add rsp rsp Conflicts: python/mxnet/kvstore.py tests/python/unittest/test_kvstore.py * fix DotCsrRspRspImpl error message (#7191) * GPU implementation of cast_storage (dense to csr) (#7081) * Added gpu implementation for cast_storage dense to csr, unit tests, and benchmark. Additionally, cast_storage interface change to accommodate the need of temporary storage in cuda kernels. 
* fixed whitespace * minor unittest update * removed whitespace * add cast storage benchmark params info Conflicts: tests/python/gpu/test_operator_gpu.py * Sparse square sum (#7206) * Add square_sum op * Add unit test and fix check_numeric_gradient * Add .cu file and example * Fix lint * Remove gpu registration * Use square_sum in test_module_fm * Modify and Add documentation for mx.nd.zeros (#7197) * Modify and Add documentation for mx.nd.zeros * Change context to cpu * Change stype to optional * Change ordering and remove optional for _zeros_sparse_ndarray * Expose kWriteInplace for imperative execution (fcompute_ex and fstatefulcompute_ex) (#133) * expose kWriteInplace to FComputeEx and FStatefulComputeEx * refactor ccode * remove duplicated test * Operator add_n for row sparse ndarrays (#7244) * Add add_n op for row-sparse ndarrays and identity FComputeEx * Fix bug in square_sum * Remove test_cast_storage_ex from gpu test since it's not implemented yet * Fix according to the cr Conflicts: src/operator/tensor/elemwise_sum.cc src/operator/tensor/elemwise_unary_op.cc tests/python/gpu/test_operator_gpu.py resolve conflict * GPU implementation of cast_storage (dense to rsp) (#7223) * CastStorageDnsRsp GPU Implementation * updating function doc and some variable types and names * adding cuda_get_device_prop() util function * added rand_shape function for n-dimensional tensors * updated cast storage unit test * added dns_to_rsp to cast storage benchmark script * removing redundant unit test * fix lint * minor change in benchmark script * fix lint * correct function description * change storage_type to stype * changed scope of using namespaces * changed variable types from index_t to dim_t * resolve merge conflict in ndarray.load * Improve StatefulOp/FCompute storage fallback (#134) * test for fcomp fallback add storage fallback test and optimize fallback logic rename function, add comments use std size() * add autograd test with sparse inputs * update sparse ndarray 
api (#139) * support mx.nd.empty for sparse ndarray Change SparseNDArray to BaseSparseNDArray support mx.nd.array with BaseSparseNDArray inputs. Update documentation with explicit subclasses of NDArrays Conflicts: python/mxnet/ndarray/__init__.py python/mxnet/ndarray/ndarray.py python/mxnet/ndarray/sparse_ndarray.py tests/python/unittest/test_sparse_ndarray.py * fix print msg in test * Handle ograd_stype='row_sparse' for square_sum backward (#143) * Add one kernel for square_sum backward pass to take rsp ograd * Add kNullOp and change to use type_assign in infer stype fallback * Sparse retain improvement (#138) * Add one more kernel for sparse retain * Fix compile * Change STORAGE_TYPE_ASSIGN_CHECK to type_assign for fallback * Fix * Add gpu compile * ignoring variables in SimpleBind that is used on python's sparse branch for now. (#135) * add bias term to fm test (#145) * update ndarray.nd, remove `invoke` from excluded members (#137) remove __weakref__ from SparseNDArray add data indice to doc revert dlpack update revert mxdoc changes move methods from BaseSparseNDarray to csrndarray and rwosparse ndarray * support storage fallback with mutable inputs (#147) * include mutatable inputs in storage fallback. refactor executor add fallback test for rms prop and adam fix lint fix lint fix test in optimizer * update according to comments * fix unit tests * fix gpu compilation err * Code changes based on reviews (#144) * code changes according to review comments remove executor debug. add doc to optimizer update sparse sgd test add dtype option to rand_sparse_ndarray * overhauled reqs for sparse operators * patch FCompExFallback with mutable inputs. update test_optimizer with more fallback cases * change executor debug macro to env var * add comment * update doc * change ndarray.aux_shape() to return const reference * remove todense to_rsp to_csr. 
replace with tostype * replace manual calls to cast_storage with tostype * disable gpu fallback test for optimizer * fix lint * add backward pass for cast_storage. refactor cast_storage test * rand_sparse_ndarray bug fix * fix cast_storage for gpu * disable csr test for fp16 * update row sparse ndarray doc * update doc * small edits according to reviews (#151) * fix lint (#152) * add license to all new files in sparse brnach (#154) * Allocate temp data on the fly for some casting operations (#149) * fix utf8 encoding in sparse ndarray * Extending the GPU dot operator (#7226) * Added GPU DotCsrRspDnsImpl declaration and TODOs * cleaning up function doc, variable types, and code-style * minor bug fixes * enable GPU dot(csr,rsp)=dns unit test * extend sparse dot unit test * adding GPU impl of DotCsrRspDns and its kernels * add TODO * changed variable types from index_t to dim_t * fix function description * added DotCsrRspRspImpl and its kernels (baseline, functionality) * added DotCsrDnsRspImpl and its kernels (baseline, functionality); plus code documentation * refactored dot benchmark * optimized DotCsrTransDnsRsp GPU kernel * change of dot impl interface to include OpContext, for temp storage * removing __device__ flag from CPU kernels * minor fixes and changing variable data types * minor fixes based on code reviews Conflicts: benchmark/python/sparse_op.py tests/python/gpu/test_operator_gpu.py tests/python/unittest/test_sparse_operator.py * Add get_synthetic_dataset function to util (#146) * Add get_synthetic_datasets * Move to test_utils * Remove _get_uniform_dataset * Move validation to its own function * Refactor the validation code for csr generation * Make test_powerlaw a nested function * Change SparseNDArray to CSRNDArray * Merge with dtype specific changes in test_utils * temporary fix for batch norm storage fallback (#156) * support random_uniform/normal/gamma with row_sparse output (#155) * add support for initilazer with rowsparse output * add scalar 
assignment to row_sparse * add setitem test to gpu * Revert "add scalar assignment to row_sparse" This reverts commit 8aef7a56c44038f67bbec93811977ea2f9fa3c30. * Revert "add setitem test to gpu" This reverts commit 3b969ac0980e8d7166a1cf46878ed2bd457986ed. * Square sum backward support one more case (#161) * Add documentation for sparse ops (#148) * draft doc for sparse op * add more stype doc for operators * add doc for cast_storage * see also cast_storage. remove base sparse ndarray. fix aux_types comemtn * grammar / spelling fix * A few fixes (#163) * fix batch norm gpu kernel. register random operators on gpu * register sparse random op on gpu, too * Minor fixes sparse ops (#160) * change CPU kernel inline directives, data types, and function doc * update dot dtype switch to use 32 and 64bit floating point only * use type_assign instead of STORAGE_TYPE_ASSIGN_CHECK * added tensor_util-inl.cuh file for common tensor operator GPU kernels * sparse Adam optimizer (#164) * add sparse adam * register gpu op * add comments * cr comments * kvstore.row_sparse_pull for GPU and end-to-end benchmark: CPU vs. 
multi-GPUs (#150) * Add gpu support for BroadcastRowSparse * Fix bugs * Add benchmark script * Increase output dim size * Update weight on CPU using single GPU for sparse tensors * More fix * Optimize sparse_retain for special case * Change row sparse pull locations * Avoid sparse retain on cpu if possible * Use acc for metric * Fix misc * fix bug in adam update (#167) fix a bug in adam update * change sparse example from regression to classification (#165) * fix python import (#166) * Add waitall to sparse_end2end.py (#169) * Add waitall() * Add dummy metric option * Add header license * Dot script changes (#159) * Add get_synthetic_datasets * Move to test_utils * Remove _get_uniform_dataset * Move validation to its own function * Refactor the validation code for csr generation * Make test_powerlaw a nested function * Change SparseNDArray to CSRNDArray * Refactoring changes to dot.py * Fix mxnet test_utils changes * Remove pdb statement * Add distribution parameter * Refactor benchmarking script * Remove unused code * Make style changes and remove unused code * Change typo in comment * Add transpose support * Change typo * 4 decimal points needed for density * Add rsp support for real datasets * Correct variable name mini_file_name * Move wait_to_read outside if * Seperate out scipy and mxnet logic in bench_dot * Fix lhs_trans issue * Move transpose outside measure_cost * Compute transpose inside measure_cost * Remove unused variables * Transpose only if trans_lhs (#171) * fix default val for distribution (#172) * fix lint (#175) * avoid cast_storage in dist-kvstore-server (#174) * avoid cast_storage in dist-kvstore-server * add stream arg to mshadow;;copy * fix copy order * Add sparse namespace to ndarray and symbol (#177) * Register dot, cast_storage, and sparse_retain under mxnet.ndarray.sparse * Add sparse to symbol namespace * Delete commented code * mv sparse_ndarray.py sparse.py * Clean up * Change docstring * changes based on code reviews (#176) * remove 
scipy dependency * move kvstore checks to backned * add const to lambda * temp fix to ndarray.md (#178) * Fix sparse namespace pylint (#179) * add comments and error msg (#181) * add clarification for csr (#182) * add clarification for csr * cr comments * revert change in test util (#183) * fix amalgamation (#184) * fix lint --- benchmark/python/sparse/cast_storage.py | 99 ++ benchmark/python/sparse/dot.py | 445 ++++++++ benchmark/python/sparse/sparse_end2end.py | 249 ++++ benchmark/python/sparse/sparse_op.py | 245 ++++ benchmark/python/sparse/util.py | 50 + docs/api/python/ndarray.md | 63 +- example/sparse/get_data.py | 32 + example/sparse/linear_classification.py | 185 +++ include/mxnet/c_api.h | 240 +++- include/mxnet/executor.h | 1 + include/mxnet/graph_attr_types.h | 48 + include/mxnet/kvstore.h | 24 + include/mxnet/ndarray.h | 502 +++++++- include/mxnet/op_attr_types.h | 18 +- include/mxnet/storage.h | 4 +- perl-package/AI-MXNetCAPI/mxnet.i | 6 + perl-package/AI-MXNetCAPI/mxnet_typemaps.i | 11 + python/mxnet/__init__.py | 3 +- python/mxnet/_ctypes/ndarray.py | 39 +- python/mxnet/base.py | 14 + python/mxnet/contrib/autograd.py | 1 + python/mxnet/executor.py | 5 +- python/mxnet/image/detection.py | 2 +- python/mxnet/image/image.py | 6 +- python/mxnet/io.py | 5 +- python/mxnet/kvstore.py | 67 +- python/mxnet/model.py | 8 +- python/mxnet/module/base_module.py | 3 +- python/mxnet/module/module.py | 8 +- python/mxnet/ndarray/__init__.py | 25 + .../_internal.py} | 0 python/mxnet/{ => ndarray}/ndarray.py | 499 +++----- python/mxnet/ndarray/op.py | 209 ++++ python/mxnet/ndarray/sparse.py | 923 +++++++++++++++ python/mxnet/ndarray/utils.py | 240 ++++ python/mxnet/optimizer.py | 23 +- python/mxnet/random.py | 14 +- python/mxnet/symbol/__init__.py | 23 + .../_internal.py} | 0 python/mxnet/symbol/op.py | 242 ++++ python/mxnet/symbol/sparse.py | 18 + python/mxnet/{ => symbol}/symbol.py | 275 +---- python/mxnet/test_utils.py | 227 +++- src/c_api/c_api.cc | 116 ++ 
src/c_api/c_api_common.h | 2 + src/c_api/c_api_executor.cc | 32 +- src/c_api/c_api_ndarray.cc | 237 +++- src/c_api/c_api_symbolic.cc | 5 +- src/c_api/c_predict_api.cc | 3 +- src/common/utils.cc | 39 + src/common/utils.cu | 39 + src/common/utils.h | 167 ++- src/executor/attach_op_execs_pass.cc | 176 ++- src/executor/exec_pass.h | 52 +- src/executor/graph_executor.cc | 326 ++++-- src/executor/graph_executor.h | 9 +- src/executor/infer_graph_attr_pass.cc | 356 ++++++ src/executor/inplace_addto_detect_pass.cc | 2 + src/io/iter_batchloader.h | 17 +- src/io/iter_libsvm.cc | 288 +++++ src/io/iter_prefetcher.h | 32 +- src/io/iter_sparse.h | 45 + src/io/iter_sparse_batchloader.h | 203 ++++ src/io/iter_sparse_prefetcher.h | 153 +++ src/kvstore/comm.h | 304 ++++- src/kvstore/kvstore_dist.h | 252 ++++- src/kvstore/kvstore_dist_server.h | 249 +++- src/kvstore/kvstore_local.h | 151 ++- src/ndarray/ndarray.cc | 450 +++++++- src/ndarray/ndarray_function-inl.h | 61 +- src/ndarray/ndarray_function.cc | 134 +++ src/ndarray/ndarray_function.h | 9 + src/nnvm/legacy_op_util.cc | 34 +- src/operator/batch_norm.cc | 2 +- src/operator/batch_norm.cu | 4 +- src/operator/deconvolution-inl.h | 2 +- src/operator/elemwise_op_common.h | 48 + src/operator/leaky_relu-inl.h | 5 +- src/operator/mxnet_op.h | 24 +- src/operator/operator_common.h | 82 ++ src/operator/optimizer_op-inl.h | 493 ++++++++ src/operator/optimizer_op.cc | 9 + src/operator/optimizer_op.cu | 9 +- src/operator/random/sample_op.cc | 9 +- src/operator/random/sample_op.cu | 60 +- src/operator/random/sample_op.h | 109 +- src/operator/tensor/cast_storage-inl.cuh | 589 ++++++++++ src/operator/tensor/cast_storage-inl.h | 392 +++++++ src/operator/tensor/cast_storage.cc | 87 ++ src/operator/tensor/cast_storage.cu | 35 + src/operator/tensor/dot-inl.cuh | 883 +++++++++++++++ src/operator/tensor/dot-inl.h | 1007 +++++++++++++++++ src/operator/tensor/dot.cc | 141 +++ src/operator/tensor/dot.cu | 45 + .../elemwise_binary_broadcast_op_basic.cc | 
1 + src/operator/tensor/elemwise_binary_op.h | 169 ++- .../tensor/elemwise_binary_op_basic.cc | 20 +- src/operator/tensor/elemwise_sum.cc | 66 +- src/operator/tensor/elemwise_unary_op.cc | 9 +- src/operator/tensor/elemwise_unary_op.cu | 7 +- src/operator/tensor/elemwise_unary_op.h | 113 +- src/operator/tensor/indexing_op.cc | 1 - src/operator/tensor/indexing_op.h | 3 + src/operator/tensor/init_op.cc | 1 + src/operator/tensor/init_op.cu | 3 +- src/operator/tensor/init_op.h | 88 +- src/operator/tensor/matrix_op-inl.h | 449 ++------ src/operator/tensor/matrix_op.cc | 94 +- src/operator/tensor/matrix_op.cu | 12 - src/operator/tensor/sparse_retain-inl.h | 396 +++++++ src/operator/tensor/sparse_retain.cc | 80 ++ src/operator/tensor/sparse_retain.cu | 36 + src/operator/tensor/square_sum-inl.h | 456 ++++++++ src/operator/tensor/square_sum.cc | 52 + src/operator/tensor/util/tensor_util-inl.cuh | 240 ++++ .../ci_build/install/ubuntu_install_python.sh | 4 +- tests/cpp/operator/batchnorm_test.cc | 6 +- tests/nightly/dist_sync_kvstore.py | 166 ++- tests/python/gpu/test_kvstore_gpu.py | 68 ++ tests/python/gpu/test_operator_gpu.py | 3 + tests/python/unittest/test_autograd.py | 75 +- tests/python/unittest/test_infer_shape.py | 20 +- tests/python/unittest/test_io.py | 106 ++ tests/python/unittest/test_kvstore.py | 134 ++- tests/python/unittest/test_module.py | 105 +- .../python/unittest/test_multi_device_exec.py | 27 + tests/python/unittest/test_ndarray.py | 1 + tests/python/unittest/test_operator.py | 113 +- tests/python/unittest/test_optimizer.py | 182 ++- tests/python/unittest/test_sparse_ndarray.py | 524 +++++++++ tests/python/unittest/test_sparse_operator.py | 373 ++++++ tests/travis/run_test.sh | 20 +- tests/travis/setup.sh | 4 +- 133 files changed, 15525 insertions(+), 1781 deletions(-) create mode 100644 benchmark/python/sparse/cast_storage.py create mode 100644 benchmark/python/sparse/dot.py create mode 100644 benchmark/python/sparse/sparse_end2end.py create mode 100644 
benchmark/python/sparse/sparse_op.py create mode 100644 benchmark/python/sparse/util.py create mode 100644 example/sparse/get_data.py create mode 100644 example/sparse/linear_classification.py create mode 100644 include/mxnet/graph_attr_types.h create mode 100644 python/mxnet/ndarray/__init__.py rename python/mxnet/{_ndarray_internal.py => ndarray/_internal.py} (100%) rename python/mxnet/{ => ndarray}/ndarray.py (87%) create mode 100644 python/mxnet/ndarray/op.py create mode 100644 python/mxnet/ndarray/sparse.py create mode 100644 python/mxnet/ndarray/utils.py create mode 100644 python/mxnet/symbol/__init__.py rename python/mxnet/{_symbol_internal.py => symbol/_internal.py} (100%) create mode 100644 python/mxnet/symbol/op.py create mode 100644 python/mxnet/symbol/sparse.py rename python/mxnet/{ => symbol}/symbol.py (90%) create mode 100644 src/common/utils.cc create mode 100644 src/common/utils.cu create mode 100644 src/executor/infer_graph_attr_pass.cc create mode 100644 src/io/iter_libsvm.cc create mode 100644 src/io/iter_sparse.h create mode 100644 src/io/iter_sparse_batchloader.h create mode 100644 src/io/iter_sparse_prefetcher.h create mode 100644 src/operator/tensor/cast_storage-inl.cuh create mode 100644 src/operator/tensor/cast_storage-inl.h create mode 100644 src/operator/tensor/cast_storage.cc create mode 100644 src/operator/tensor/cast_storage.cu create mode 100644 src/operator/tensor/dot-inl.cuh create mode 100644 src/operator/tensor/dot-inl.h create mode 100644 src/operator/tensor/dot.cc create mode 100644 src/operator/tensor/dot.cu create mode 100644 src/operator/tensor/sparse_retain-inl.h create mode 100644 src/operator/tensor/sparse_retain.cc create mode 100644 src/operator/tensor/sparse_retain.cu create mode 100644 src/operator/tensor/square_sum-inl.h create mode 100644 src/operator/tensor/square_sum.cc create mode 100644 src/operator/tensor/util/tensor_util-inl.cuh create mode 100644 tests/python/gpu/test_kvstore_gpu.py create mode 100644 
tests/python/unittest/test_sparse_ndarray.py create mode 100644 tests/python/unittest/test_sparse_operator.py diff --git a/benchmark/python/sparse/cast_storage.py b/benchmark/python/sparse/cast_storage.py new file mode 100644 index 000000000000..7ae537398c42 --- /dev/null +++ b/benchmark/python/sparse/cast_storage.py @@ -0,0 +1,99 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import ctypes + +from mxnet.test_utils import * +import os +import time +import argparse + +from mxnet.base import check_call, _LIB + +parser = argparse.ArgumentParser(description="Benchmark cast storage operators", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--num-omp-threads', type=int, default=1, help='number of omp threads to set in MXNet') +args = parser.parse_args() + +def measure_cost(repeat, f, *args, **kwargs): + start = time.time() + results = [] + for i in range(repeat): + (f(*args, **kwargs)).wait_to_read() + end = time.time() + diff = end - start + return diff / repeat + + +def run_cast_storage_synthetic(): + def dense_to_sparse(m, n, density, ctx, repeat, stype): + set_default_context(ctx) + data_shape = (m, n) + dns_data = rand_ndarray(data_shape, stype, density).tostype('default') + dns_data.wait_to_read() + + # do one warm up run, verify correctness + assert same(mx.nd.cast_storage(dns_data, stype).asnumpy(), dns_data.asnumpy()) + + # start benchmarking + cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype) + results = '{:10.1f} {:>10} {:8d} {:8d} {:10.2f}'.format(density*100, str(ctx), m, n, cost*1000) + print(results) + + check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads))) + + # params + # m number of rows + # n number of columns + # density density of the matrix + # num_repeat number of benchmark runs to average over + # contexts mx.cpu(), mx.gpu() + # note: benchmark different contexts separately; to benchmark cpu, compile without CUDA + # benchmarks dns_to_csr, dns_to_rsp + m = [ 512, 512] + n = [50000, 100000] + density = [1.00, 0.80, 0.60, 0.40, 0.20, 0.10, 0.05, 0.02, 0.01] + num_repeat = 10 + contexts = [mx.gpu()] + benchmarks = ["dns_to_csr", "dns_to_rsp"] + + # run benchmark + for b in benchmarks: + stype = '' + print("==================================================") + if b is "dns_to_csr": + stype = 'csr' + print(" cast_storage benchmark: dense to csr, size m x n 
") + elif b is "dns_to_rsp": + stype = 'row_sparse' + print(" cast_storage benchmark: dense to rsp, size m x n ") + else: + print("invalid benchmark: %s" %b) + continue + print("==================================================") + headline = '{:>10} {:>10} {:>8} {:>8} {:>10}'.format('density(%)', 'context', 'm', 'n', 'time(ms)') + print(headline) + for i in range(len(n)): + for ctx in contexts: + for den in density: + dense_to_sparse(m[i], n[i], den, ctx, num_repeat, stype) + print("") + print("") + + +if __name__ == "__main__": + run_cast_storage_synthetic() diff --git a/benchmark/python/sparse/dot.py b/benchmark/python/sparse/dot.py new file mode 100644 index 000000000000..fe322821a09f --- /dev/null +++ b/benchmark/python/sparse/dot.py @@ -0,0 +1,445 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import ctypes + +import os +import time +import argparse +import subprocess +import scipy.sparse as sp + +import mxnet as mx +import numpy as np +import numpy.random as rnd +from mxnet.test_utils import rand_ndarray, set_default_context, assert_almost_equal +from mxnet.base import check_call, _LIB +from util import get_data, estimate_density + +PARSER = argparse.ArgumentParser(description="Benchmark sparse operators", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +PARSER.add_argument('--num-omp-threads', type=int, + default=1, help='number of omp threads to set in MXNet') +PARSER.add_argument('--gpu', action='store_true', + help="to be run on gpu") +# TODO: Use logging later +PARSER.add_argument('--verbose', action='store_true', + help="Verbose output") +ARGS = PARSER.parse_args() + +# some data information +KDDA = { + 'data_mini': 'kdda.t.mini', + 'data_name': 'kdda.t', + 'data_origin_name': 'kdda.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2", + 'feature_dim': 20216830, + 'm': [1, 8, 32], + 'batch_size': [64], + 'default_index': {'batch_size': 0, + 'output_dim': 2}, + 'num_batches': 10 +} + +AVAZU = { + 'data_mini': 'avazu-app.t.mini', + 'data_name': 'avazu-app.t', + 'data_origin_name': 'avazu-app.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2", + 'feature_dim': 1000000, + 'm': [1, 1000, 2000], + 'batch_size': [128, 256], + 'default_index': {'batch_size': 0, + 'output_dim': 1}, + 'num_batches': 10 +} + +CRITEO = { + 'data_mini': 'criteo.t.mini', + 'data_name': 'criteo.t', + 'data_origin_name': 'criteo.t.bz2', + 'url' : "https://s3-us-west-2.amazonaws.com/sparse-dataset/criteo.t.bz2", + 'feature_dim': 8388621, + 'm': [1, 8, 16, 32, 64], + 'batch_size': [64, 128], + 'default_index': {'batch_size': 1, + 'output_dim': 3}, + 'num_batches': 10 +} + +SYNTHETIC1 = { + 'feature_dim': [1000000], + 'm': [256, 1000], + 'density': [0.001, 0.005, 0.01, 0.02, 0.05, + 
0.1, 0.2, 0.5, 0.65], + 'batch_size': [64, 128], + 'default_index': {'batch_size': 1, + 'density': 2, + 'output_dim': 1, + 'feature_dim': 0}, + 'num_repeat': 10 +} + +SYNTHETIC2 = { + 'feature_dim': [8000000, 16000000], + 'm': [1, 32], + 'density': [0.001, 0.005, 0.01, 0.02, 0.05, + 0.1, 0.2, 0.5, 0.65], + 'batch_size': [64, 128], + 'default_index': {'batch_size': 1, + 'density': 2, + 'output_dim': 1, + 'feature_dim': 0}, + 'num_repeat': 10 +} + +def measure_cost(repeat, scipy_trans_lhs, scipy_dns_lhs, func_name, *args, **kwargs): + """Measure time cost of running a function + """ + mx.nd.waitall() + args_list = [] + for arg in args: + args_list.append(arg) + start = time.time() + if scipy_trans_lhs: + args_list[0] = np.transpose(args_list[0]) if scipy_dns_lhs else sp.spmatrix.transpose(args_list[0]) + for _ in range(repeat): + func_name(*args_list, **kwargs) + mx.nd.waitall() + end = time.time() + diff = end - start + return diff / repeat + + +def _get_iter(path, data_shape, batch_size): + data_train = mx.io.LibSVMIter(data_libsvm=path, + data_shape=data_shape, + batch_size=batch_size) + data_iter = iter(data_train) + return data_iter + + +def _line_count(path): + return int(subprocess.check_output('wc -l {}'.format(path), shell=True).split()[0]) + + +def _compare_sparse_dense(data_dir, file_name, mini_file_name, feature_dim, + output_dim, density, batch_size, num_batches=3, num_repeat=5, transpose=False, + rsp=False): + + def create_mini_path(mini_path, path, num_batches): + """Samples batches of size: batch_size, total number: num_batches + from the dataset files for running benchmarks""" + if not os.path.exists(mini_path): + last = _line_count(path) - num_batches * batch_size + last = last if last >= 1 else 1 + start = int(rnd.uniform(1, last)) + os.system("sed -n '%d,%dp' %r > %r" + %(start, start + num_batches * batch_size, path, mini_path)) + assert os.path.exists(mini_path) + + + def run_benchmark(mini_path): + """Run benchmarks + """ + data_shape = 
(feature_dim, ) + train_iter = _get_iter(mini_path, data_shape, batch_size) + weight_row_dim = batch_size if transpose else feature_dim + weight_shape = (weight_row_dim, output_dim) + if not rsp: + weight = mx.nd.random_uniform(low=0, high=1, shape=weight_shape) + else: + weight = rand_ndarray(weight_shape, "row_sparse", density=0.05, distribution="uniform") + total_cost = {} + average_cost = {} + count = 0 + total_cost["sparse"] = 0. + total_cost["dense"] = 0. + for _ in train_iter: + csr_data = train_iter.getdata() + dns_data = csr_data.tostype('default') + cost_sparse = measure_cost(num_repeat, False, False, mx.nd.dot, csr_data, weight, transpose_a=transpose) + cost_dense = measure_cost(num_repeat, False, False, mx.nd.dot, dns_data, weight, transpose_a=transpose) + total_cost["sparse"] += cost_sparse + total_cost["dense"] += cost_dense + count = count + 1 + average_cost["sparse"] = total_cost["sparse"] / count + average_cost["dense"] = total_cost["dense"] / count + return (average_cost["sparse"], average_cost["dense"]) + + + def print_result(average_cost_sparse, average_cost_dense): + """Print result of comparison between sparse and dense + """ + ratio = average_cost_dense / average_cost_sparse + fmt = '{:15.4f} {:10d} {:10d} {:10d} {:20.2f} {:15.2f} {:15.2f} {:10} {:10}' + print(fmt.format(density * 100, batch_size, output_dim, feature_dim, + ratio, average_cost_dense*1000, average_cost_sparse*1000, + transpose, rsp)) + + mini_path = os.path.join(data_dir, mini_file_name) + path = os.path.join(data_dir, file_name) + create_mini_path(mini_path, path, num_batches) + average_cost_sparse, average_cost_dense = run_benchmark(mini_path) + print_result(average_cost_sparse, average_cost_dense) + + +def test_dot_real(data_dict): + """Dot operator testing with real datasets""" + data_dir = os.path.join(os.getcwd(), 'data') + + path = os.path.join(data_dir, data_dict['data_name']) + if not os.path.exists(path): + get_data( + data_dir, + data_dict['data_name'], + 
data_dict['url'], + data_dict['data_origin_name'] + ) + assert os.path.exists(path) + + k = data_dict['feature_dim'] + m = data_dict['m'] + batch_size_list = data_dict['batch_size'] + + default_output_index = data_dict['default_index']['output_dim'] + default_batch_size_index = data_dict['default_index']['batch_size'] + density = estimate_density(path, data_dict['feature_dim']) + num_batches = data_dict['num_batches'] + + assert default_batch_size_index < len(batch_size_list) + assert default_output_index < len(m) + if ARGS.verbose: + print("Running Benchmarking on %r data") % data_dict['data_mini'] + print('{:>15} {:>10} {:>10} {:>10} {:>20} {:>15} {:>15} {:>10} {:>10}'.format('density(%)', + 'n', + 'm', + 'k', + 't_dense/t_sparse', + 't_dense(ms)', + 't_sparse(ms)', + 'is_transpose', + 'rhs_rsp')) + + + for output_dim in m: + _compare_sparse_dense(data_dir, data_dict['data_name'], data_dict['data_mini'], + k, output_dim, density, + batch_size_list[default_batch_size_index], num_batches) + _compare_sparse_dense(data_dir, data_dict['data_name'], data_dict['data_mini'], + k, output_dim, density, + batch_size_list[default_batch_size_index], num_batches, + transpose=True) + _compare_sparse_dense(data_dir, data_dict['data_name'], data_dict['data_mini'], + k, output_dim, density, + batch_size_list[default_batch_size_index], num_batches, rsp=True) + + for batch_size in batch_size_list: + _compare_sparse_dense(data_dir, data_dict['data_name'], data_dict['data_mini'], + k, m[default_output_index], density, batch_size, num_batches) + _compare_sparse_dense(data_dir, data_dict['data_name'], data_dict['data_mini'], + k, m[default_output_index], density, batch_size, num_batches, + transpose=True) + _compare_sparse_dense(data_dir, data_dict['data_name'], data_dict['data_mini'], + k, output_dim, density, + batch_size_list[default_batch_size_index], num_batches, rsp=True) + + +def test_dot_synthetic(data_dict): + """benchmark sparse mxnet dot and scipy dot operator with matrices 
of given density. + `t_sparse` is the runtime of the invoked sparse dot operator in ms, while `t_dense` is the + runtime of dot(dns, dns), with the same matrices except that they are in default storage type. + """ + # Benchmark MXNet and Scipys dot operator + def bench_dot(lhs_shape, rhs_shape, lhs_stype, rhs_stype, + lhs_den, rhs_den, trans_lhs, ctx, num_repeat=10, fw="mxnet", distribution="uniform"): + set_default_context(ctx) + assert fw == "mxnet" or fw == "scipy" + # Set funcs + dot_func_sparse = mx.nd.dot if fw == "mxnet" else sp.spmatrix.dot + dot_func_dense = mx.nd.dot if fw == "mxnet" else np.dot + # Create matrix instances + lhs_nd = rand_ndarray(lhs_shape, lhs_stype, density=lhs_den, distribution=distribution) + # only uniform distribution supported for rhs + rhs_nd = rand_ndarray(rhs_shape, rhs_stype, density=rhs_den, distribution="uniform") + lhs_dns = None + rhs_dns = None + dense_cost = None + sparse_cost = None + + if fw == "mxnet": + lhs_dns = lhs_nd if lhs_stype == 'default' else lhs_nd.tostype('default') + rhs_dns = rhs_nd if rhs_stype == 'default' else rhs_nd.tostype('default') + # One warm up run, verify correctness + out = dot_func_sparse(lhs_nd, rhs_dns, trans_lhs) + out_expected = dot_func_dense(lhs_dns, rhs_dns, trans_lhs) + assert_almost_equal(out.asnumpy(), out_expected.asnumpy(), rtol=1e-1, atol=1e-1) + sparse_cost = measure_cost(num_repeat, False, False, dot_func_sparse, lhs_nd, rhs_nd, trans_lhs) + dense_cost = measure_cost(num_repeat, False, False, dot_func_dense, lhs_dns, rhs_dns, trans_lhs) + else: + lhs_dns = lhs_nd.asnumpy() + rhs_dns = rhs_nd.asnumpy() + lhs_nd = sp.csr_matrix(lhs_nd.asnumpy()) + rhs_nd = rhs_nd.asnumpy() + # One warm up run, verify correctness + lhs_nd_copy = sp.spmatrix.transpose(lhs_nd) if trans_lhs else lhs_nd + out = dot_func_sparse(lhs_nd_copy, rhs_dns) + sparse_cost = measure_cost(num_repeat, trans_lhs, False, dot_func_sparse, lhs_nd, rhs_nd) + dense_cost = measure_cost(num_repeat, trans_lhs, True, 
dot_func_dense, lhs_dns, rhs_dns) + + speedup = dense_cost / sparse_cost + # Print results + m = lhs_shape[0] + k = lhs_shape[1] + n = rhs_shape[1] + result_pattern = '{:15.1f} {:15.1f} {:>10} {:8d} {:8d} {:8d} {:13.2f} {:13.2f} {:8.2f}' + results = result_pattern.format(lhs_den*100, + rhs_den*100, + str(ctx), + m, + k, + n, + sparse_cost*1000, + dense_cost*1000, + speedup) + print(results) + + def print_benchmark_info(lhs, rhs, lhs_trans, fw): + trans_str = "^T" if lhs_trans else "" + print("========================================================") + print(" %s sparse dot benchmark: dot(%s, %s) = %s ") % (fw, lhs, rhs, rhs) + print(" (matrix multiplication: (m x k)%s * (k x n) = m x n) ") % (trans_str) + print("========================================================") + headline_pattern = '{:>15} {:>15} {:>10} {:>8} {:>8} {:>8} {:>13} {:>13} {:>8}' + headline = headline_pattern.format('lhs_density(%)', + 'rhs_density(%)', + 'context', + 'm', 'k', 'n', + 't_sparse(ms)', + 't_dense(ms)', + 'speedup') + print(headline) + + + def run_benchmark(ctx=None, lhs="csr", lhs_trans=False, rhs="dns", fw="mxnet", rhs_density=1, + distribution="uniform"): + if lhs != "csr": + raise ValueError("Value other than csr for lhs not supported") + if rhs_density > 1 or rhs_density < 0: + raise ValueError("rhs_density has to be between 0 and 1") + + print_benchmark_info(lhs, rhs, lhs_trans, fw) + + + lhs_stype = "csr" + rhs_stype = "row_sparse" if rhs == "rsp" else "default" + + feature_dim_list = data_dict['feature_dim'] + output_dim_list = data_dict['m'] + batch_size_list = data_dict['batch_size'] + density_list = data_dict['density'] + + default_output_index = data_dict['default_index']['output_dim'] + default_batch_size_index = data_dict['default_index']['batch_size'] + default_feature_index = data_dict['default_index']['feature_dim'] + default_density_index = data_dict['default_index']['density'] + num_repeat = data_dict['num_repeat'] + + for output_dim in output_dim_list: + if 
lhs_trans: + output_row_dim = batch_size_list[default_batch_size_index] + else: + output_row_dim = feature_dim_list[default_feature_index] + bench_dot((batch_size_list[default_batch_size_index], + feature_dim_list[default_feature_index]), + (output_row_dim, output_dim), + lhs_stype, rhs_stype, + density_list[default_density_index], rhs_density, + lhs_trans, ctx, num_repeat=num_repeat, + fw=fw, distribution=distribution) + + for feature_dim in feature_dim_list: + if lhs_trans: + output_row_dim = batch_size_list[default_batch_size_index] + else: + output_row_dim = feature_dim + bench_dot((batch_size_list[default_batch_size_index], feature_dim), + (output_row_dim, output_dim_list[default_output_index]), + lhs_stype, rhs_stype, density_list[default_density_index], rhs_density, + lhs_trans, ctx, num_repeat=num_repeat, fw=fw, distribution=distribution) + + for batch_size in batch_size_list: + if lhs_trans: + output_row_dim = batch_size + else: + output_row_dim = feature_dim_list[default_feature_index] + bench_dot((batch_size, feature_dim_list[default_feature_index]), + (output_row_dim, + output_dim_list[default_output_index]), + lhs_stype, rhs_stype, density_list[default_density_index], + rhs_density, lhs_trans, ctx, num_repeat=num_repeat, + fw=fw, distribution=distribution) + + for density in density_list: + if lhs_trans: + output_row_dim = batch_size_list[default_batch_size_index] + else: + output_row_dim = feature_dim_list[default_feature_index] + bench_dot((batch_size_list[default_batch_size_index], + feature_dim_list[default_feature_index]), + (output_row_dim, + output_dim_list[default_output_index]), + lhs_stype, rhs_stype, density, rhs_density, lhs_trans, ctx, + num_repeat=num_repeat, fw=fw, distribution=distribution) + + check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(ARGS.num_omp_threads))) + context = mx.gpu() if ARGS.gpu else mx.cpu() + # TODO(anirudh): make the data dicts to config which can be passed at runtime + distributions = ["uniform", "powerlaw"] + for 
distribution in distributions: + run_benchmark(context, lhs="csr", + rhs="default", lhs_trans=False, + fw="mxnet", rhs_density=1, + distribution=distribution) + run_benchmark(context, lhs="csr", + rhs="default", lhs_trans=True, + fw="mxnet", rhs_density=1, + distribution=distribution) + run_benchmark(context, lhs="csr", + rhs="rsp", lhs_trans=False, + fw="mxnet", rhs_density=0.05, + distribution=distribution) + if not ARGS.gpu: + run_benchmark(context, lhs="csr", + rhs="default", lhs_trans=False, + fw="scipy", rhs_density=1, + distribution=distribution) + run_benchmark(context, lhs="csr", + rhs="default", lhs_trans=True, + fw="scipy", rhs_density=1, + distribution=distribution) + + +if __name__ == "__main__": + begin_time = time.time() + test_dot_real(KDDA) + test_dot_real(AVAZU) + test_dot_real(CRITEO) + test_dot_synthetic(SYNTHETIC1) + test_dot_synthetic(SYNTHETIC2) + total_time = time.time() - begin_time + print("total time is %f") % total_time diff --git a/benchmark/python/sparse/sparse_end2end.py b/benchmark/python/sparse/sparse_end2end.py new file mode 100644 index 000000000000..e9d8bf884713 --- /dev/null +++ b/benchmark/python/sparse/sparse_end2end.py @@ -0,0 +1,249 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from mxnet.test_utils import * +import time +import argparse +import os + +parser = argparse.ArgumentParser(description="Run sparse linear regression " \ + "with distributed kvstore", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--profiler', type=int, default=0, + help='whether to use profiler') +parser.add_argument('--num-epoch', type=int, default=1, + help='number of epochs to train') +parser.add_argument('--batch-size', type=int, default=512, + help='number of examples per batch') +parser.add_argument('--num-batch', type=int, default=99999999, + help='number of batches per epoch') +parser.add_argument('--dummy-iter', type=int, default=0, + help='whether to use dummy iterator to exclude io cost') +parser.add_argument('--kvstore', type=str, default='local', + help='what kvstore to use [local, dist_sync, etc]') +parser.add_argument('--log-level', type=str, default='debug', + help='logging level [debug, info, error]') +parser.add_argument('--dataset', type=str, default='avazu', + help='what test dataset to use') +parser.add_argument('--num-gpu', type=int, default=0, + help='number of gpus to use. 
0 means using cpu(0);' + 'otherwise, use gpu(0),...,gpu(num_gpu-1)') +parser.add_argument('--output-dim', type=int, default=4, + help='number of columns of the forward output') +parser.add_argument('--dummy-metric', type=int, default=0, + help='whether to call update_metric') + + +def get_libsvm_data(data_dir, data_name, url, data_origin_name): + if not os.path.isdir(data_dir): + os.system("mkdir " + data_dir) + os.chdir(data_dir) + if (not os.path.exists(data_name)): + import urllib + zippath = os.path.join(data_dir, data_origin_name) + urllib.urlretrieve(url, zippath) + os.system("bzip2 -d %r" % data_origin_name) + os.chdir("..") + + +class DummyIter(mx.io.DataIter): + "A dummy iterator that always return the same batch, used for speed testing" + def __init__(self, real_iter): + super(DummyIter, self).__init__() + self.real_iter = real_iter + self.provide_data = real_iter.provide_data + self.provide_label = real_iter.provide_label + self.batch_size = real_iter.batch_size + + for batch in real_iter: + self.the_batch = batch + break + + def __iter__(self): + return self + + def next(self): + return self.the_batch + +# testing dataset sources +avazu = { + 'data_name': 'avazu-app.t', + 'data_origin_name': 'avazu-app.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2", + 'feature_dim': 1000000, +} + +kdda = { + 'data_name': 'kdda.t', + 'data_origin_name': 'kdda.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2", + 'feature_dim': 20216830, +} + +datasets = { 'kdda' : kdda, 'avazu' : avazu } + + +def get_sym(feature_dim): + x = mx.symbol.Variable("data", stype='csr') + norm_init = mx.initializer.Normal(sigma=0.01) + w = mx.symbol.Variable("w", shape=(feature_dim, args.output_dim), init=norm_init, stype='row_sparse') + embed = mx.symbol.dot(x, w) + y = mx.symbol.Variable("softmax_label") + model = mx.symbol.SoftmaxOutput(data=embed, label=y, name="out") + return model + + +def 
row_sparse_pull(kv, key, data, slices, weight_array, priority): + # if have kvstore, need to pull corresponding rows of + # the weights to each context + # column indices (NDArray type) of the csr data + # used as the row_idx of the weight row-sparse matrix + row_indices = data.indices + if len(slices) == 1: + kv.row_sparse_pull(key, weight_array, priority=priority, row_ids=row_indices) + else: # more than one slices, multi-GPU training. Need to retain weight rows according to data slices + # TODO(junwu): + # the following line blocks, may need to pre-compute + # and cache it outside the for loop + indptr = data.indptr.asnumpy() + row_idx_array = [] + for s in slices: + row_idx_array.append(row_indices[indptr[s.start]:indptr[s.stop]]) + kv.row_sparse_pull(key, weight_array, priority=priority, row_ids=row_idx_array) + + +if __name__ == '__main__': + + # arg parser + args = parser.parse_args() + num_epoch = args.num_epoch + num_batch = args.num_batch + kvstore = args.kvstore + profiler = args.profiler > 0 + batch_size = args.batch_size if args.num_gpu == 0 else args.num_gpu * args.batch_size + dummy_iter = args.dummy_iter + dataset = args.dataset + log_level = args.log_level + contexts = mx.context.cpu(0) if args.num_gpu < 1\ + else [mx.context.gpu(i) for i in range(args.num_gpu)] + + # create kvstore when there are gpus + kv = mx.kvstore.create(kvstore) if args.num_gpu >= 1 else None + rank = kv.rank if kv is not None else 0 + num_worker = kv.num_workers if kv is not None else 1 + + # only print log for rank 0 worker + import logging + if rank != 0: + log_level = logging.ERROR + elif log_level == 'DEBUG': + log_level = logging.DEBUG + else: + log_level = logging.INFO + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=log_level, format=head) + + # dataset + assert(dataset in datasets), "unknown dataset " + dataset + metadata = datasets[dataset] + feature_dim = metadata['feature_dim'] + if logging: + logging.debug('preparing data ... 
') + data_dir = os.path.join(os.getcwd(), 'data') + path = os.path.join(data_dir, metadata['data_name']) + if not os.path.exists(path): + get_libsvm_data(data_dir, metadata['data_name'], metadata['url'], + metadata['data_origin_name']) + assert os.path.exists(path) + + # data iterator + train_data = mx.io.LibSVMIter(data_libsvm=path, data_shape=(feature_dim,), + batch_size=batch_size, num_parts=num_worker, + part_index=rank) + if dummy_iter: + train_data = DummyIter(train_data) + + # model + model = get_sym(feature_dim) + + # module + mod = mx.mod.Module(symbol=model, data_names=['data'], + label_names=['softmax_label'], context=contexts) + mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label) + mod.init_params(initializer=mx.init.Uniform(scale=.1)) + sgd = mx.optimizer.SGD(momentum=0.0, clip_gradient=5.0, + learning_rate=0.1, rescale_grad=1.0/batch_size/num_worker) + mod.init_optimizer(optimizer=sgd, kvstore=kv) + # use accuracy as the metric + metric = mx.metric.create('acc') + + index = mod._exec_group.param_names.index('w') + # weight_array bound to executors of the contexts + weight_array = mod._exec_group.param_arrays[index] + + mx.nd.waitall() # sync point for initialization + # start profiler + if profiler: + device = 'cpu' + if args.num_gpu > 0: + device = 'gpu' + str(args.num_gpu) + name = 'profile_' + args.dataset + '_' + device + '_nworker' + str(num_worker)\ + + '_batchsize' + str(args.batch_size) + '_outdim' + str(args.output_dim) + '.json' + mx.profiler.profiler_set_config(mode='all', filename=name) + mx.profiler.profiler_set_state('run') + + logging.debug('start training ...') + start = time.time() + data_iter = iter(train_data) + for epoch in range(num_epoch): + nbatch = 0 + end_of_batch = False + data_iter.reset() + metric.reset() + next_batch = next(data_iter) + if kv is not None: + row_sparse_pull(kv, 'w', next_batch.data[0], mod._exec_group.slices, weight_array, -index) + while not end_of_batch: + nbatch += 1 + 
batch = next_batch + + mod.forward_backward(batch) + # update parameters + mod.update() + + try: + # pre fetch next batch + next_batch = next(data_iter) + if nbatch == num_batch: + raise StopIteration + if kv is not None: + row_sparse_pull(kv, 'w', next_batch.data[0], mod._exec_group.slices, weight_array, -index) + except StopIteration: + end_of_batch = True + # accumulate prediction accuracy + if args.dummy_metric == 0: + mod.update_metric(metric, batch.label) + else: # call waitall to replace update_metric as sync point + mx.nd.waitall() # sync point for the current minibatch + logging.info('epoch %d, %s' % (epoch, metric.get())) + if epoch == 0: + print "num_batches = ", nbatch + if profiler: + mx.profiler.profiler_set_state('stop') + end = time.time() + time_cost = end - start + logging.info('num_worker = ' + str(num_worker) + ', time cost = ' + str(time_cost)) diff --git a/benchmark/python/sparse/sparse_op.py b/benchmark/python/sparse/sparse_op.py new file mode 100644 index 000000000000..0683aa84eacb --- /dev/null +++ b/benchmark/python/sparse/sparse_op.py @@ -0,0 +1,245 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import ctypes + +from mxnet.test_utils import * +import scipy.sparse as sp +import os +import time +import argparse + +from mxnet.base import check_call, _LIB +from util import get_data, estimate_density + +parser = argparse.ArgumentParser(description="Benchmark sparse operators", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--num-omp-threads', type=int, default=1, help='number of omp threads to set in MXNet') +args = parser.parse_args() + +# some data information +kdda = { + 'data_mini': 'kdda.t.mini', + 'data_name': 'kdda.t', + 'data_origin_name': 'kdda.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2", + 'feature_dim': 20216830, + 'm': 200, + 'batch_size': [64] +} + +avazu = { + 'data_mini': 'avazu-app.t.mini', + 'data_name': 'avazu-app.t', + 'data_origin_name': 'avazu-app.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2", + 'feature_dim': 1000000, + 'm': 500, + 'batch_size': [64, 128] +} + + +def measure_cost(repeat, f, *args, **kwargs): + # start bench + start = time.time() + results = [] + for i in range(repeat): + results.append(f(*args, **kwargs)) + for result in results: + result.wait_to_read() + end = time.time() + diff = end - start + return diff / repeat + + +def test_dot_real(data_dict): + def get_iter(path, data_shape, batch_size): + data_train = mx.io.LibSVMIter(data_libsvm=path, + data_shape=data_shape, + batch_size=batch_size) + data_iter = iter(data_train) + return data_iter + + data_dir = os.path.join(os.getcwd(), 'data') + + path = os.path.join(data_dir, data_dict['data_name']) + if not os.path.exists(path): + get_data( + data_dir, + data_dict['data_name'], + data_dict['url'], + data_dict['data_origin_name'] + ) + assert os.path.exists(path) + + k = data_dict['feature_dim'] + m = data_dict['m'] + density = estimate_density(path, data_dict['feature_dim']) + + mini_path = os.path.join(data_dir, data_dict['data_mini']) + 
if not os.path.exists(mini_path): + os.system("head -n 2000 %r > %r" % (path, mini_path)) + assert os.path.exists(mini_path) + + print "Running Benchmarking on %r data" % data_dict['data_mini'] + for batch_size in data_dict['batch_size']: # iterator through different batch size of choice + print "batch_size is %d" % batch_size + # model + data_shape = (k, ) + train_iter = get_iter(mini_path, data_shape, batch_size) + weight = mx.nd.random_uniform(low=0, high=1, shape=(k, m)) + + csr_data = [] + dns_data = [] + num_batch = 0 + for batch in train_iter: + data = train_iter.getdata() + csr_data.append(data) + dns_data.append(data.tostype('default')) + num_batch += 1 + bag_of_data = [csr_data, dns_data] + num_repeat = 5 + costs = [] + for d in bag_of_data: + weight.wait_to_read() + cost = 0. + count = 0 + for d_batch in d: + d_batch.wait_to_read() + cost += measure_cost(num_repeat, mx.nd.dot, d_batch, weight) + count += 1 + costs.append(cost/count) + t_sparse = costs[0] + t_dense = costs[1] + ratio = t_dense / t_sparse + print('density(%)\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse') + fmt = "%0.4f\t\t%d\t%d\t%d\t%0.2f\t\t\t%0.4f\t%0.6f" + print(fmt % (density * 100, batch_size, m, k, ratio, t_dense, t_sparse)) + + +def test_dot_synthetic(): + """benchmark mx.nd.dot(sparse_ndarray, dense_ndarray) with given density. + `t_sparse` is the time cost of dot(csr, dns), while `t_dense` is the time cost + of dot(dns, dns), with the same matrix except that it is in default storage type. 
+ """ + def measure_cost_forward_baseline(repeat, dot, lhs, rhs): + start = time.time() + for i in range(repeat): + dot(lhs, rhs) + end = time.time() + diff = end - start + return diff / repeat + + def measure_cost_backward_baseline(repeat, dot, transpose, lhs, rhs): + start = time.time() + for i in range(repeat): + dot(transpose(lhs), rhs) + end = time.time() + diff = end - start + return diff / repeat + + def bench_dot_forward(m, k, n, density, ctx, repeat): + set_default_context(ctx) + dns = mx.nd.random_uniform(shape=(k, n)).copyto(ctx) + data_shape = (m, k) + csr_data = rand_ndarray(data_shape, 'csr', density) + dns_data = csr_data.tostype('default') + rhs_dns_np = dns.asnumpy() + lhs_csr_sp = sp.csr_matrix(dns_data.asnumpy()) # csr in scipy + lhs_dns_np = lhs_csr_sp.tostype('default') + + data = [dns_data, csr_data] + costs = [] + for d in data: + dns.wait_to_read() + d.wait_to_read() + cost = measure_cost(repeat, mx.nd.dot, d, dns) + costs.append(cost) + ratio = costs[0] / costs[1] + + costs_baseline = [] + cost = measure_cost_forward_baseline(repeat, np.dot, lhs_dns_np, rhs_dns_np) + costs_baseline.append(cost) + cost = measure_cost_forward_baseline(repeat, sp.spmatrix.dot, lhs_csr_sp, rhs_dns_np) + costs_baseline.append(cost) + ratio_baseline = costs_baseline[0] / costs_baseline[1] + fmt = "%0.1f\t\t%s\t%d\t%d\t%d\t%0.2f\t\t\t%0.2f\t%0.5f\t\t%0.2f\t\t\t\t%0.6f\t%0.5f" + print(fmt % (density * 100, str(ctx), n, m, k, ratio, costs[0], costs[1], + ratio_baseline, costs_baseline[0], costs_baseline[1])) + + def bench_dot_backward(m, k, n, density, ctx, repeat): + set_default_context(ctx) + dns = mx.nd.random_uniform(shape=(m, n)).copyto(ctx) + data_shape = (m, k) + csr_data = rand_ndarray(data_shape, 'csr', density) + dns_data = csr_data.tostype('default') + rhs_dns_np = dns.asnumpy() + lhs_csr_sp = sp.csr_matrix(dns_data.asnumpy()) + lhs_dns_np = lhs_csr_sp.tostype('default') + + data = [dns_data, csr_data] + costs = [] + for d in data: + dns.wait_to_read() + 
d.wait_to_read() + cost = measure_cost(repeat, mx.nd.dot, d, dns, transpose_a=True) + costs.append(cost) + ratio = costs[0] / costs[1] + + costs_baseline = [] + cost = measure_cost_backward_baseline(repeat, np.dot, np.transpose, lhs_dns_np, rhs_dns_np) + costs_baseline.append(cost) + cost = measure_cost_backward_baseline(repeat, sp.spmatrix.dot, sp.spmatrix.transpose, lhs_csr_sp, rhs_dns_np) + costs_baseline.append(cost) + ratio_baseline = costs_baseline[0] / costs_baseline[1] + fmt = "%0.1f\t\t%s\t%d\t%d\t%d\t%0.2f\t\t\t%0.2f\t%0.5f\t\t%0.2f\t\t\t\t%0.6f\t%0.5f" + print(fmt % (density * 100, str(ctx), n, m, k, ratio, costs[0], costs[1], + ratio_baseline, costs_baseline[0], costs_baseline[1])) + + print("A = sparse NDArray of shape(m, k)") + print("B = dense NDArray of shape(k, n)") + print("dot_forward\tdot(csr, dns)") + print('density(%)\tcontext\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse' + '\tt_scipy_dense/t_scipy_sparse\tt_scipy_dense\tt_scipy_sparse') + + check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads))) + # TODO(haibin) make these runtime options + m = 512 + k = [50000, 100000] + n = [64, 128] + density = [1.00, 0.90, 0.70, 0.50, 0.30, 0.20, 0.10, 0.07, 0.05, 0.02, 0.01, 0.005, 0.001] + num_repeat = 10 + # contexts = [mx.cpu(), mx.gpu(0)] + contexts = [mx.cpu()] + for i in range(2): + for ctx in contexts: + for den in density: + bench_dot_forward(m, k[i], n[i], den, ctx, num_repeat) + + print("dot_backward\tdot(csr.T, dns)") + print('density(%)\tcontext\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse' + '\tt_scipy_dense/t_scipy_sparse\tt_scipy_dense\tt_scipy_sparse') + for i in range(2): + for ctx in contexts: + for den in density: + bench_dot_backward(m, k[i], n[i], den, ctx, num_repeat) + + +if __name__ == "__main__": + test_dot_real(avazu) + test_dot_real(kdda) + test_dot_synthetic() diff --git a/benchmark/python/sparse/util.py b/benchmark/python/sparse/util.py new file mode 100644 index 000000000000..947ff4a65037 --- /dev/null +++ 
b/benchmark/python/sparse/util.py @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import random + + +def get_data(data_dir, data_name, url, data_origin_name): + if not os.path.isdir(data_dir): + os.system("mkdir " + data_dir) + os.chdir(data_dir) + if (not os.path.exists(data_name)): + import urllib + zippath = os.path.join(data_dir, data_origin_name) + urllib.urlretrieve(url, zippath) + os.system("bzip2 -d %r" % data_origin_name) + os.chdir("..") + + +def estimate_density(DATA_PATH, feature_size): + """estimate the density of the sparse dataset by averaging 10 passes over the file, each sampling lines with probability 0.01""" + if not os.path.exists(DATA_PATH): + raise Exception("Data is not there!") + density = [] + P = 0.01 + for _ in xrange(10): + num_non_zero = 0 + num_sample = 0 + with open(DATA_PATH) as f: + for line in f: + if (random.random() < P): + num_non_zero += len(line.split(" ")) - 1 + num_sample += 1 + density.append(num_non_zero * 1.0 / (feature_size * num_sample)) + return sum(density) / len(density) + diff --git a/docs/api/python/ndarray.md b/docs/api/python/ndarray.md index 5e9f7e1a1184..3f2cef24a73a 100644 --- a/docs/api/python/ndarray.md +++ b/docs/api/python/ndarray.md @@ -64,9 +64,21 @@ A detailed tutorial is
available at ``` In the rest of this document, we first overview the methods provided by the -`ndarray.NDArray` class, and then list other routines provided by the -`ndarray` package. +`ndarray.NDArray` class and its subclasses, and then list other routines +provided by the `ndarray` package. +The `ndarray` package provides several classes: + +```eval_rst +.. autosummary:: + :nosignatures: + + NDArray + sparse.CSRNDArray + sparse.RowSparseNDArray +``` + +We summarize the interface for each class in the following sections. ## The `NDArray` class @@ -80,6 +92,7 @@ In the rest of this document, we first overview the methods provided by the NDArray.size NDArray.context NDArray.dtype + NDArray.stype ``` ### Array conversion @@ -94,6 +107,7 @@ In the rest of this document, we first overview the methods provided by the NDArray.asnumpy NDArray.asscalar NDArray.astype + NDArray.tostype ``` ### Array change shape @@ -171,6 +185,35 @@ In the rest of this document, we first overview the methods provided by the NDArray.wait_to_read ``` +## The `sparse.RowSparseNDArray` Class + +```eval_rst +.. autosummary:: + :nosignatures: + + sparse.RowSparseNDArray.copyto + sparse.RowSparseNDArray.tostype + sparse.RowSparseNDArray.__setitem__ + sparse.RowSparseNDArray.__getitem__ + sparse.RowSparseNDArray.data + sparse.RowSparseNDArray.indices +``` + +## The `sparse.CSRNDArray` Class + +```eval_rst +.. autosummary:: + :nosignatures: + + sparse.CSRNDArray.copyto + sparse.CSRNDArray.tostype + sparse.CSRNDArray.__setitem__ + sparse.CSRNDArray.__getitem__ + sparse.CSRNDArray.data + sparse.CSRNDArray.indices + sparse.CSRNDArray.indptr +``` + ## Array creation routines ```eval_rst @@ -499,8 +542,24 @@ The `contrib.ndarray` module contains many useful experimental APIs for new feat ```eval_rst + +.. autoclass:: mxnet.ndarray.NDArray + :members: + :special-members: + +.. autoclass:: mxnet.ndarray.sparse.CSRNDArray + :members: + :special-members: + +.. 
autoclass:: mxnet.ndarray.sparse.RowSparseNDArray + :members: + :special-members: + .. automodule:: mxnet.ndarray :members: + :imported-members: + :special-members: + :exclude-members: CachedOp, BaseSparseNDArray, NDArray, CSRNDArray, RowSparseNDArray .. automodule:: mxnet.random :members: diff --git a/example/sparse/get_data.py b/example/sparse/get_data.py new file mode 100644 index 000000000000..578cf2ce5226 --- /dev/null +++ b/example/sparse/get_data.py @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# pylint: skip-file +import os, gzip +import pickle as pickle +import sys + +def get_libsvm_data(data_dir, data_name, url, data_origin_name): + if not os.path.isdir(data_dir): + os.mkdir(data_dir) + os.chdir(data_dir) + if (not os.path.exists(data_name)): + import urllib + zippath = os.path.join(data_dir, data_origin_name) + urllib.urlretrieve(url, zippath) + os.system("bzip2 -d %r" % data_origin_name) + os.chdir("..") diff --git a/example/sparse/linear_classification.py b/example/sparse/linear_classification.py new file mode 100644 index 000000000000..567568c6eb80 --- /dev/null +++ b/example/sparse/linear_classification.py @@ -0,0 +1,185 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import mxnet as mx +from mxnet.test_utils import * +from get_data import get_libsvm_data +import time +import argparse +import os + +parser = argparse.ArgumentParser(description="Run sparse linear classification " \ + "with distributed kvstore", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--profiler', type=int, default=0, + help='whether to use profiler') +parser.add_argument('--num-epoch', type=int, default=1, + help='number of epochs to train') +parser.add_argument('--batch-size', type=int, default=8192, + help='number of examples per batch') +parser.add_argument('--num-batch', type=int, default=99999999, + help='number of batches per epoch') +parser.add_argument('--dummy-iter', type=int, default=0, + help='whether to use dummy iterator to exclude io cost') +parser.add_argument('--kvstore', type=str, default='dist_sync', + help='what kvstore to use [local, dist_sync, etc]') +parser.add_argument('--log-level', type=str, default='DEBUG', + help='logging level [debug, info, error]') +parser.add_argument('--dataset', type=str, default='avazu', + help='what test dataset to use') + +class DummyIter(mx.io.DataIter): + "A dummy iterator that always returns the same batch, used for speed testing" + def __init__(self, real_iter): + super(DummyIter, self).__init__() + self.real_iter = real_iter + self.provide_data = real_iter.provide_data + self.provide_label = real_iter.provide_label + self.batch_size = real_iter.batch_size + + for batch in real_iter: + self.the_batch = batch + break + + def __iter__(self): + return self + + def next(self): + return self.the_batch + +# testing dataset sources +avazu = { + 'data_name': 'avazu-app.t', + 'data_origin_name': 'avazu-app.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2", + 'feature_dim': 1000000, +} + +kdda = { + 'data_name': 'kdda.t', + 'data_origin_name': 'kdda.t.bz2', + 'url': 
"https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2", + 'feature_dim': 20216830, +} + +datasets = { 'kdda' : kdda, 'avazu' : avazu } + +def linear_model(feature_dim): + x = mx.symbol.Variable("data", stype='csr') + norm_init = mx.initializer.Normal(sigma=0.01) + weight = mx.symbol.Variable("weight", shape=(feature_dim, 1), init=norm_init, stype='row_sparse') + bias = mx.symbol.Variable("bias", shape=(1,), init=norm_init) + dot = mx.symbol.dot(x, weight) + pred = mx.symbol.broadcast_add(dot, bias) + y = mx.symbol.Variable("softmax_label") + model = mx.symbol.SoftmaxOutput(data=pred, label=y, name="out") + return model + +if __name__ == '__main__': + # arg parser + args = parser.parse_args() + num_epoch = args.num_epoch + num_batch = args.num_batch + kvstore = args.kvstore + profiler = args.profiler > 0 + batch_size = args.batch_size + dummy_iter = args.dummy_iter + dataset = args.dataset + log_level = args.log_level + + # create kvstore + kv = mx.kvstore.create(kvstore) + rank = kv.rank + num_worker = kv.num_workers + + # only print log for rank 0 worker + import logging + if rank != 0: + log_level = logging.ERROR + elif log_level == 'DEBUG': + log_level = logging.DEBUG + else: + log_level = logging.INFO + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=log_level, format=head) + + # dataset + assert(dataset in datasets), "unknown dataset " + dataset + metadata = datasets[dataset] + feature_dim = metadata['feature_dim'] + if logging: + logging.debug('preparing data ... 
') + data_dir = os.path.join(os.getcwd(), 'data') + path = os.path.join(data_dir, metadata['data_name']) + if not os.path.exists(path): + get_libsvm_data(data_dir, metadata['data_name'], metadata['url'], + metadata['data_origin_name']) + assert os.path.exists(path) + + # data iterator + train_data = mx.io.LibSVMIter(data_libsvm=path, data_shape=(feature_dim,), + batch_size=batch_size, num_parts=num_worker, + part_index=rank) + if dummy_iter: + train_data = DummyIter(train_data) + + # model + model = linear_model(feature_dim) + + # module + mod = mx.mod.Module(symbol=model, data_names=['data'], label_names=['softmax_label']) + mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label) + mod.init_params(initializer=mx.init.Uniform(scale=.1)) + sgd = mx.optimizer.SGD(momentum=0.0, clip_gradient=5.0, + learning_rate=0.1, rescale_grad=1.0/batch_size/num_worker) + mod.init_optimizer(optimizer=sgd, kvstore=kv) + # use accuracy as the metric + metric = mx.metric.create('Accuracy') + + # start profiler + if profiler: + name = 'profile_output_' + str(num_worker) + '.json' + mx.profiler.profiler_set_config(mode='all', filename=name) + mx.profiler.profiler_set_state('run') + + logging.debug('start training ...') + start = time.time() + data_iter = iter(train_data) + for epoch in range(num_epoch): + nbatch = 0 + data_iter.reset() + metric.reset() + for batch in data_iter: + nbatch += 1 + row_ids = batch.data[0].indices + # pull sparse weight + index = mod._exec_group.param_names.index('weight') + kv.row_sparse_pull('weight', mod._exec_group.param_arrays[index], + priority=-index, row_ids=[row_ids]) + mod.forward_backward(batch) + # update parameters + mod.update() + # accumulate prediction accuracy + mod.update_metric(metric, batch.label) + if nbatch == num_batch: + break + logging.info('epoch %d, %s' % (epoch, metric.get())) + if profiler: + mx.profiler.profiler_set_state('stop') + end = time.time() + time_cost = end - start + logging.info('num_worker 
= ' + str(num_worker) + ', time cost = ' + str(time_cost)) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 2289354e8a5e..a43f73fe45ab 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -276,6 +276,38 @@ MXNET_DLL int MXNDArrayCreateEx(const mx_uint *shape, int delay_alloc, int dtype, NDArrayHandle *out); + + +/*! + * \brief create an empty sparse NDArray with specified shape and data type + * \param storage_type the storage type of the ndarray + * \param shape the pointer to the shape + * \param ndim the dimension of the shape + * \param dev_type device type, specify device we want to take + * \param dev_id the device id of the specific device + * \param delay_alloc whether to delay allocation until + * the narray is first mutated + * \param dtype data type of created array + * \param num_aux the number of aux data to support this ndarray + * \param aux_type data type of the aux data for the created array + * \param aux_ndims the dimension of the shapes of aux data + * \param aux_shape the shapes of aux data + * \param out the returning handle + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXNDArrayCreateSparseEx(int storage_type, + const mx_uint *shape, + mx_uint ndim, + int dev_type, + int dev_id, + int delay_alloc, + int dtype, + mx_uint num_aux, + int *aux_type, + mx_uint *aux_ndims, + const mx_uint *aux_shape, + NDArrayHandle *out); + /*! * \brief create a NDArray handle that is loaded from raw bytes. * \param buf the head of the raw bytes @@ -350,6 +382,17 @@ MXNET_DLL int MXNDArraySyncCopyFromCPU(NDArrayHandle handle, MXNET_DLL int MXNDArraySyncCopyToCPU(NDArrayHandle handle, void *data, size_t size); +/*! + * \brief Copy src.data() to dst.data() if i = -1, else dst.aux_data(i) if i >= 0 + * This function blocks. Do not use it in performance critical code. 
+ * \param handle_dst handle of a dst ndarray whose data/aux_data has been allocated + * \param handle_src handle of a src ndarray which has default storage type + * \param i dst data blob indicator + */ +MXNET_DLL int MXNDArraySyncCopyFromNDArray(NDArrayHandle handle_dst, + const NDArrayHandle handle_src, + const int i); + /*! * \brief Wait until all the pending writes with respect NDArray are finished. * Always call this before read data out synchronizely. @@ -388,6 +431,7 @@ MXNET_DLL int MXNDArraySlice(NDArrayHandle handle, mx_uint slice_begin, mx_uint slice_end, NDArrayHandle *out); + /*! * \brief Index the NDArray along axis 0. * \param handle the handle to the NDArray @@ -398,6 +442,13 @@ MXNET_DLL int MXNDArraySlice(NDArrayHandle handle, MXNET_DLL int MXNDArrayAt(NDArrayHandle handle, mx_uint idx, NDArrayHandle *out); + +/*! + * \brief get the storage type of the array + */ +MXNET_DLL int MXNDArrayGetStorageType(NDArrayHandle handle, + int *out_storage_type); + /*! * \brief Reshape the NDArray. * \param handle the handle to the narray @@ -436,6 +487,34 @@ MXNET_DLL int MXNDArrayGetData(NDArrayHandle handle, */ MXNET_DLL int MXNDArrayGetDType(NDArrayHandle handle, int *out_dtype); + +/*! + * \brief get the type of the ith aux data in NDArray + * \param handle the handle to the narray + * \param i the index of the aux data + * \param out_type pointer holder to get type of aux data + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXNDArrayGetAuxType(NDArrayHandle handle, + mx_uint i, + int *out_type); + +/*! + * \brief Get a deep copy of the ith aux data blob + * in the form of an NDArray of default storage type. + * This function blocks. Do not use it in performance critical code. + */ +MXNET_DLL int MXNDArrayGetAuxNDArray(NDArrayHandle handle, + mx_uint i, + NDArrayHandle *out); + +/*! + * \brief Get a deep copy of the data blob + * in the form of an NDArray of default storage type. + * This function blocks. 
Do not use it in performance critical code. + */ +MXNET_DLL int MXNDArrayGetDataNDArray(NDArrayHandle handle, + NDArrayHandle *out); /*! * \brief get the context of the NDArray * \param handle the handle to the narray @@ -581,6 +660,28 @@ MXNET_DLL int MXImperativeInvoke(AtomicSymbolCreator creator, int num_params, const char **param_keys, const char **param_vals); +/*! + * \brief invoke a nnvm op and imperative function + * \param creator the op + * \param num_inputs number of input NDArrays + * \param inputs input NDArrays + * \param num_outputs number of output NDArrays + * \param outputs output NDArrays + * \param num_params number of keyword parameters + * \param param_keys keys for keyword parameters + * \param param_vals values for keyword parameters + * \param out_stypes output ndarrays' stypes + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXImperativeInvokeEx(AtomicSymbolCreator creator, + int num_inputs, + NDArrayHandle *inputs, + int *num_outputs, + NDArrayHandle **outputs, + int num_params, + const char **param_keys, + const char **param_vals, + const int **out_stypes); /*! * \brief set whether to record operator for autograd * \param is_recording 1 when recording, 0 when not recording. @@ -666,6 +767,30 @@ MXNET_DLL int MXCreateCachedOp(SymbolHandle handle, * \brief free cached operator */ MXNET_DLL int MXFreeCachedOp(CachedOpHandle handle); +/*! + * \brief invoke cached operator + */ +MXNET_DLL int MXInvokeCachedOp(CachedOpHandle handle, + int num_inputs, + NDArrayHandle *inputs, + int *num_outputs, + NDArrayHandle **outputs); +/*! 
+ * \brief invoke a cached op + * \param handle the handle to the cached op + * \param num_inputs number of input NDArrays + * \param inputs input NDArrays + * \param num_outputs number of output NDArrays + * \param outputs output NDArrays + * \param out_stypes output ndarrays' stypes + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXInvokeCachedOpEx(CachedOpHandle handle, + int num_inputs, + NDArrayHandle *inputs, + int *num_outputs, + NDArrayHandle **outputs, + const int** out_stypes); /*! * \brief invoke cached operator */ @@ -1017,20 +1142,20 @@ MXNET_DLL int MXSymbolInferShape(SymbolHandle sym, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXSymbolInferShapePartial(SymbolHandle sym, - mx_uint num_args, - const char** keys, - const mx_uint *arg_ind_ptr, - const mx_uint *arg_shape_data, - mx_uint *in_shape_size, - const mx_uint **in_shape_ndim, - const mx_uint ***in_shape_data, - mx_uint *out_shape_size, - const mx_uint **out_shape_ndim, - const mx_uint ***out_shape_data, - mx_uint *aux_shape_size, - const mx_uint **aux_shape_ndim, - const mx_uint ***aux_shape_data, - int *complete); + mx_uint num_args, + const char** keys, + const mx_uint *arg_ind_ptr, + const mx_uint *arg_shape_data, + mx_uint *in_shape_size, + const mx_uint **in_shape_ndim, + const mx_uint ***in_shape_data, + mx_uint *out_shape_size, + const mx_uint **out_shape_ndim, + const mx_uint ***out_shape_data, + mx_uint *aux_shape_size, + const mx_uint **aux_shape_ndim, + const mx_uint ***aux_shape_data, + int *complete); /*! * \brief infer type of unknown input types given the known one. 
@@ -1061,6 +1186,10 @@ MXNET_DLL int MXSymbolInferType(SymbolHandle sym, mx_uint *aux_type_size, const int **aux_type_data, int *complete); + + + + //-------------------------------------------- // Part 4: Executor interface //-------------------------------------------- @@ -1222,36 +1351,39 @@ MXNET_DLL int MXExecutorBindEX(SymbolHandle symbol_handle, ExecutorHandle *out); MXNET_DLL int MXExecutorSimpleBind(SymbolHandle symbol_handle, - int dev_type, - int dev_id, - const mx_uint num_g2c_keys, - const char** g2c_keys, - const int* g2c_dev_types, - const int* g2c_dev_ids, - const mx_uint provided_grad_req_list_len, - const char** provided_grad_req_names, - const char** provided_grad_req_types, - const mx_uint num_provided_arg_shapes, - const char** provided_arg_shape_names, - const mx_uint* provided_arg_shape_data, - const mx_uint* provided_arg_shape_idx, - const mx_uint num_provided_arg_dtypes, - const char** provided_arg_dtype_names, - const int* provided_arg_dtypes, - const mx_uint num_shared_arg_names, - const char** shared_arg_name_list, - int* shared_buffer_len, - const char** shared_buffer_name_list, - NDArrayHandle* shared_buffer_handle_list, - const char*** updated_shared_buffer_name_list, - NDArrayHandle** updated_shared_buffer_handle_list, - mx_uint* num_in_args, - NDArrayHandle** in_args, - NDArrayHandle** arg_grads, - mx_uint* num_aux_states, - NDArrayHandle** aux_states, - ExecutorHandle shared_exec_handle, - ExecutorHandle* out); + int dev_type, + int dev_id, + const mx_uint num_g2c_keys, + const char** g2c_keys, + const int* g2c_dev_types, + const int* g2c_dev_ids, + const mx_uint provided_grad_req_list_len, + const char** provided_grad_req_names, + const char** provided_grad_req_types, + const mx_uint num_provided_arg_shapes, + const char** provided_arg_shape_names, + const mx_uint* provided_arg_shape_data, + const mx_uint* provided_arg_shape_idx, + const mx_uint num_provided_arg_dtypes, + const char** provided_arg_dtype_names, + const int* 
provided_arg_dtypes, + const mx_uint num_provided_arg_stypes, + const char** provided_arg_stype_names, + const int* provided_arg_stypes, + const mx_uint num_shared_arg_names, + const char** shared_arg_name_list, + int* shared_buffer_len, + const char** shared_buffer_name_list, + NDArrayHandle* shared_buffer_handle_list, + const char*** updated_shared_buffer_name_list, + NDArrayHandle** updated_shared_buffer_handle_list, + mx_uint* num_in_args, + NDArrayHandle** in_args, + NDArrayHandle** arg_grads, + mx_uint* num_aux_states, + NDArrayHandle** aux_states, + ExecutorHandle shared_exec_handle, + ExecutorHandle* out); /*! * \brief set a call back to notify the completion of operation */ @@ -1468,6 +1600,26 @@ MXNET_DLL int MXKVStorePullEx(KVStoreHandle handle, const char** keys, NDArrayHandle* vals, int priority); + +/*! + * \brief pull a list of (key, value) pairs from the kvstore, where each key is a string. + * The NDArray pulled back will be in row_sparse storage with only the specified + * row_ids present (other rows are zeros). + * \param handle handle to the kvstore + * \param num the number of key-value pairs + * \param keys the list of keys + * \param vals the list of values + * \param row_ids the list of row_id NDArrays + * \param priority the priority of the action + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXKVStorePullRowSparse(KVStoreHandle handle, + mx_uint num, + const char** keys, + NDArrayHandle* vals, + const NDArrayHandle* row_ids, + int priority); + /*! 
* \brief user-defined updater for the kvstore * It's this updater's responsibility to delete \a recv and \a local diff --git a/include/mxnet/executor.h b/include/mxnet/executor.h index a74d3b07b5be..85d34778dd8c 100644 --- a/include/mxnet/executor.h +++ b/include/mxnet/executor.h @@ -133,6 +133,7 @@ class Executor { const std::vector& aux_state_ctxes, const std::unordered_map& arg_shape_map, const std::unordered_map& arg_dtype_map, + const std::unordered_map& arg_stype_map, const std::vector& grad_req_types, const std::unordered_set& param_names, std::vector* in_args, diff --git a/include/mxnet/graph_attr_types.h b/include/mxnet/graph_attr_types.h new file mode 100644 index 000000000000..3aba0119d8ca --- /dev/null +++ b/include/mxnet/graph_attr_types.h @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file graph_attr_types.h + * \brief Data structures that can appear in graph attributes. + */ +#ifndef MXNET_GRAPH_ATTR_TYPES_H_ +#define MXNET_GRAPH_ATTR_TYPES_H_ + +#include + +namespace mxnet { + +/*! + * \brief The result holder of storage type of each NodeEntry in the graph. 
+ * \note Stored under graph.attrs["storage_type"], provided by Pass "InferStorageType" + * + * \code + * Graph g = ApplyPass(src_graph, "InferStorageType"); + * const StorageTypeVector& stypes = g.GetAttr("storage_type"); + * // get storage type by entry id + * int entry_type = stypes[g.indexed_graph().entry_id(my_entry)]; + * \endcode + * + * \sa FInferStorageType + */ +using StorageTypeVector = std::vector; + +} // namespace mxnet + +#endif // MXNET_GRAPH_ATTR_TYPES_H_ diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index d2924ecea1b5..9ea63b4cec79 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -25,6 +25,7 @@ #define MXNET_KVSTORE_H_ #include #include +#include #include #include #include @@ -173,6 +174,29 @@ class KVStore { const std::vector& values, int priority = 0) = 0; + /*! + * \brief pull a list of key-value pairs from the store. + * The NDArray pulled back will be in row_sparse storage with only the + * specified row_ids present (other rows are zeros). + * \param str_keys the list of keys + * \param val_rowids the list of buffers - row_id pairs + * \param priority the priority of the action. + */ + virtual void PullRowSparse(const std::vector& str_keys, + const std::vector>& val_rowids, + const int priority = 0) = 0; + + /*! + * \brief pull a list of key-value pairs from the store, where each key is a string. + * The NDArray pulled back will be in row_sparse storage with only the + * specified row_ids present (other rows are zeros). + * \param str_keys the list of keys in string format + * \param val_rowids the list of buffers - row_id pairs + * \param priority the priority of the action. 
+ */ + virtual void PullRowSparse(const std::vector& str_keys, + const std::vector>& val_rowids, + const int priority = 0) = 0; /** * \brief the prototype of user-defined updater diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index d7dff4098b27..754bc28e7bed 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -47,7 +47,6 @@ namespace mxnet { -// forward declaration namespace autograd { class AGNode; @@ -71,6 +70,23 @@ class AGNodeEntry { class AutogradRuntime; } // namespace autograd +// enum for storage types +namespace csr { +enum CSRAuxType {kIndPtr, kIdx}; +} + +namespace rowsparse { +enum RowSparseAuxType {kIdx}; +} + +enum NDArrayStorageType { + kUndefinedStorage = -1, // undefined storage + kDefaultStorage, // dense + kRowSparseStorage, // row sparse + kCSRStorage, // csr +}; + + /*! * \brief ndarray interface */ @@ -91,10 +107,55 @@ class NDArray { */ NDArray(const TShape &shape, Context ctx, bool delay_alloc = false, int dtype = mshadow::default_type_flag) - : ptr_(std::make_shared(shape.Size(), ctx, delay_alloc, dtype)), + : ptr_(std::make_shared(shape, ctx, delay_alloc, dtype)), shape_(shape), dtype_(dtype), entry_({nullptr, 0, 0}) { #if MKL_EXPERIMENTAL == 1 Mkl_mem_ = std::make_shared(); +#endif + } + /*! 
\brief constructor for NDArray with storage type + */ + NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx, + bool delay_alloc = true, int dtype = mshadow::default_type_flag, + std::vector aux_types = {}, std::vector aux_shapes = {}, + TShape storage_shape = TShape(mshadow::Shape1(0))) + : shape_(shape), dtype_(dtype), entry_({nullptr, 0, 0}) { + // Assign default aux types if not given + if (aux_types.size() == 0) { + if (stype == kRowSparseStorage) { + aux_types = {mshadow::kInt64}; + } else if (stype == kCSRStorage) { + aux_types = {mshadow::kInt64, mshadow::kInt64}; + } else { + LOG(FATAL) << "Unknown storage type " << stype; + } + } + // Assign default shapes if not given + // unknown shapes are intialized as {0} such that Size() would return 0 + if (aux_shapes.size() == 0) { + if (stype == kRowSparseStorage) { + aux_shapes = {TShape(mshadow::Shape1(0))}; + } else if (stype == kCSRStorage) { + // aux shapes for indptr and indices + aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))}; + } else { + LOG(FATAL) << "Unknown storage type " << stype; + } + } + if (storage_shape.Size() == 0) { + if (stype == kRowSparseStorage) { + storage_shape = shape; + storage_shape[0] = aux_shapes[rowsparse::kIdx][0]; + } else if (stype == kCSRStorage) { + storage_shape = aux_shapes[csr::kIdx]; + } else { + LOG(FATAL) << "Unknown storage type " << stype; + } + } + ptr_ = std::make_shared(stype, storage_shape, ctx, delay_alloc, + dtype, aux_types, aux_shapes); +#if MKL_EXPERIMENTAL == 1 + Mkl_mem_ = std::make_shared(); #endif } /*! @@ -111,17 +172,86 @@ class NDArray { Mkl_mem_ = std::make_shared(); #endif } + /*! 
- * \return the shape of current NDArray + * \brief constructing a static NDArray of non-default storage that shares data with TBlob + * Use with caution: allocate ONLY ONE NDArray for each TBlob, + * make sure the memory region is available through out the life of NDArray + * \param stype the storage type of NDArray + * \param shape the shape of NDArray + * \param data the memory content of static data + * \param aux_data the memory content of static aux data + * \param dev_id the device id this tensor sits at + */ + NDArray(const NDArrayStorageType stype, const TShape &shape, + const TBlob &data, const std::vector &aux_data, int dev_id) + : ptr_(std::make_shared(stype, data, aux_data, dev_id)), shape_(shape), + dtype_(data.type_flag_), entry_({nullptr, 0, 0}) { +#if MKL_EXPERIMENTAL == 1 + Mkl_mem_ = std::make_shared(); +#endif + } + + + /*! + * \return the shape of current NDArray. */ inline const TShape& shape() const { return shape_; } + /*! + * \return the shape of underlying chunk which stores the NDArray data/value. + * It is only intended for non-default storage. For row-sparse storage, it is the shape of + * the tensor which stores the non-zero values. + */ + inline const TShape &storage_shape() const { + CHECK(ptr_ != nullptr); + CHECK_NE(storage_type(), kDefaultStorage) + << "storage_shape() is not intended for kDefaultStorage."; + return ptr_->storage_shape; + } + + /*! + * \brief get the shape of aux_data(index) + * \param index the index of the aux data + * \return the shape of aux data at given index + */ + inline const TShape& aux_shape(size_t index) const { + CHECK_NE(storage_type(), kDefaultStorage) + << "aux_shape() is not intended for kDefaultStorage."; + return ptr_->aux_shapes[index]; + } + + /* \return the shapes of all aux data */ + const std::vector& aux_shapes() const { + CHECK_NE(storage_type(), kDefaultStorage) + << "aux_shapes() is not intended for kDefaultStorage."; + return ptr_->aux_shapes; + } + + /*! 
returns the dtypes of all aux data */ + const std::vector& aux_types() const { + CHECK_NE(storage_type(), kDefaultStorage) + << "aux_types() is not intended for kDefaultStorage."; + return ptr_->aux_types; + } + + /*! + * \brief For a sparse operation on a csr matrix for example, + * the size of the column index array + * is an estimated value in the beginning for allocating enough capacity + * for the final result. After the operation is done, the exact size of + * the shape is known and need to be reset using this function. + */ + inline void set_aux_shape(size_t index, const TShape& shape) const { + ptr_->set_aux_shape(index, shape); + } + /*! * \return the data TBlob */ inline const TBlob& data() const { - CheckAndAlloc(); + if (storage_type() == kDefaultStorage) CheckAndAlloc(); SetTBlob(); return tblob_; } @@ -129,6 +259,26 @@ class NDArray { * \return the gradient ndarray. */ NDArray grad() const; + + /*! + * \return the aux TBlob + */ + inline TBlob aux_data(size_t i) const { + auto stype = storage_type(); + TBlob res; + auto shape = aux_shape(i); + auto type = aux_type(i); + MSHADOW_TYPE_SWITCH(type, DType, { + auto dptr = static_cast(ptr_->aux_handles[i].dptr); + CHECK(stype == kRowSparseStorage || stype == kCSRStorage) + << "Unexpected storage type: " << stype; + res = TBlob(dptr, shape, ptr_->aux_handles[i].ctx.dev_mask(), type); + }); +#if MKL_EXPERIMENTAL == 1 + res.Mkl_mem_ = Mkl_mem_; +#endif + return res; + } /*! * \return the context of NDArray, this function is only valid when the NDArray is not empty */ @@ -141,6 +291,15 @@ class NDArray { inline int dtype() const { return dtype_; } + inline int aux_type(size_t i) const { + CHECK(!is_none()); + return ptr_->aux_types[i]; + } + + inline NDArrayStorageType storage_type() const { + if (is_none()) return kUndefinedStorage; + return ptr_->storage_type; + } /*! 
\return whether this ndarray is not initialized */ inline bool is_none() const { return ptr_.get() == nullptr; @@ -149,6 +308,27 @@ class NDArray { bool fresh_out_grad() const; /*! \return updated grad state in entry_ */ void set_fresh_out_grad(bool state) const; + // returns true if a sparse ndarray's aux_data and storage are initialized + inline bool storage_initialized() const { + if (is_none()) return false; + auto stype = storage_type(); + CHECK_NE(stype, kDefaultStorage) + << "storage_initialized() is not intended for kDefaultStorage."; + if (stype == kRowSparseStorage) { + CHECK_EQ(aux_shape(rowsparse::kIdx)[0], storage_shape()[0]) + << "inconsistent storage shape " << storage_shape() + << " vs. aux shape " << aux_shape(rowsparse::kIdx); + return aux_shape(0).Size() != 0; + } else if (stype == kCSRStorage) { + CHECK_EQ(aux_shape(csr::kIdx)[0], storage_shape()[0]) + << "inconsistent storage shape " << storage_shape() + << " vs. aux shape " << aux_shape(csr::kIdx); + return aux_shape(0).Size() != 0; + } else { + LOG(FATAL) << "Unknown storage type"; + } + return true; + } /*! * \brief Block until all the pending write operations with respect * to current NDArray are finished, and read can be performed. @@ -179,6 +359,12 @@ class NDArray { * \param strm the output stream */ void Save(dmlc::Stream *strm) const; + /*! + * \brief load ndarrays before supporting sparse ndarrays + * \param strm the output stream + * \param magic the magic number used for version control + */ + bool LegacyLoad(dmlc::Stream *strm, const uint32_t magic); /*! * \brief load the content from binary stream * \param strm the output stream @@ -269,6 +455,12 @@ class NDArray { * \param size the size of the source array, in sizeof(DType) not raw btyes. */ void SyncCopyFromCPU(const void *data, size_t size) const; + + /*! + * \brief Copy from src.data()/aux_data(i) to this->data()/aux_data(j) + */ + void SyncCopyFromNDArray(const NDArray &src, int i = -1, int j = -1); + /*! 
* \brief Do a synchronize copy to a continugous CPU memory region. * @@ -282,17 +474,31 @@ class NDArray { void SyncCopyToCPU(void *data, size_t size) const; /*! * \brief Slice a NDArray - * \param begin begin index in first dim - * \param end end index in first dim + * \param begin begin index in first dim (inclusive) + * \param end end index in first dim (exclusive) * \return sliced NDArray */ NDArray Slice(index_t begin, index_t end) const; + /*! * \brief Index a NDArray * \param idx the index * \return idx-th sub array NDArray */ NDArray At(index_t idx) const; + + /*! + * \brief Generate a deep copy of aux_data(i) returned as + * a default storage type NDArray + */ + NDArray aux_ndarray(size_t i) const; + + /*! + * \brief Generate a deep copy of data() returned as a + * default storage type NDArray + */ + NDArray data_ndarray() const; + /*! * \brief Create a NDArray that shares memory with current one * The new array must have smaller memory size than the current array. @@ -301,6 +507,8 @@ class NDArray { * \return NDArray in new shape and type. */ inline NDArray AsArray(const TShape &shape, int dtype) const { + CHECK_EQ(storage_type(), kDefaultStorage) + << "AsArray is intended only for kDefaultStorage."; CHECK_GE(shape_.Size() * mshadow::mshadow_sizeof(dtype_), shape.Size() * mshadow::mshadow_sizeof(dtype)) << "NDArray.AsArray: target memory size is bigger"; @@ -342,8 +550,45 @@ class NDArray { * This is an internal function used by system that normal user should not use */ inline void CheckAndAlloc() const { + CHECK_EQ(storage_type(), kDefaultStorage); ptr_->CheckAndAlloc(); } + + /*! + * \brief Allocate the space if the allocation has been delayed + * or the requested size is bigger than the available one. + * This function can only be called by ndarray of default + * storage type and effectively changes the ndarray's shape_. 
+ * Note: This function is named as this to avoid overload conflict + * with CheckAndAlloc(const std::vector &aux_shapes), since + * TShape tmp = some_shape is equivalent to TShape tmp = {some_shape}. + */ + void ReshapeAndAlloc(const TShape& shape) { + CHECK_EQ(storage_type(), kDefaultStorage); + CHECK(!is_none()); + shape_ = shape; + ptr_->CheckAndAlloc(shape.Size() * mshadow::mshadow_sizeof(dtype_)); + } + + /* ! + * \brief Alloc memory for non-default storage + * aux_shape is only known at run time + */ + inline void CheckAndAlloc(const std::vector &aux_shapes) const { + CHECK_NE(storage_type(), kDefaultStorage) + << "CheckAndAlloc(aux_shapes) is not intended for kDefaultStorage"; + ptr_->CheckAndAlloc(shape_, aux_shapes, dtype_); + } + inline void CheckAndAllocData(const TShape &storage_shape) const { + CHECK_NE(storage_type(), kDefaultStorage) + << "CheckAndAllocData is not intended for kDefaultStorage"; + ptr_->CheckAndAllocData(storage_shape, dtype_); + } + inline void CheckAndAllocAuxData(size_t i, const TShape &aux_shape) const { + CHECK_NE(storage_type(), kDefaultStorage) + << "CheckAndAllocAuxData is not intended for kDefaultStorage"; + ptr_->CheckAndAllocAuxData(i, aux_shape); + } /*! * \brief Save list of ndarray into the Stream.x * \param fo The stream of output. @@ -366,44 +611,138 @@ class NDArray { private: friend class autograd::AutogradRuntime; /*! \brief the real data chunk that backs NDArray */ + // shandle is used to store the actual values in the NDArray + // aux_handles store the aux data(such as indices) if it's needed by non-default storage. struct Chunk { - /*! \brief storage handlefrom storage engine */ + /*! \brief storage handle from storage engine. + for non-default storage, shandle stores the data(value) array. + */ Storage::Handle shandle; + /*! \brief storage handles for aux data (e.g index) + for row_sparse, aux_handles[0] = indices + for csr, aux_handles[0] = indptr, aux_handles[1] = indices + */ + std::vector aux_handles; /*! 
\brief variable from engine */ Engine::VarHandle var; /*! * \brief if this is true, this means the data do not come * from Storage, and do not need to be freed */ + /*! \brief construct from static data */ bool static_data; - /*! \brief whether allocation is delayed */ + /*! \brief whether data allocation is delayed. This doesn't indicate whether aux data + allocation is delayed. */ bool delay_alloc; + // the type of the storage. The storage_type is never kUndefinedStorage once the chunk + // is constructed. + NDArrayStorageType storage_type = kDefaultStorage; + /*! \brief type of aux */ + std::vector aux_types; + // context of data + Context ctx; + // The shape of the chunk data. + // This might not be the same shape as the NDArray, since the storage may be sparse. + // The default value for storage_shape is {0} when an empty non-default NDArray is created. + TShape storage_shape; + // The shape of aux data. The default value for the shape depends on the type of storage. + // If aux_shapes[i].Size() is zero, aux data i is empty. + std::vector aux_shapes; + /*! \brief default cosntructor */ - Chunk() : static_data(true), delay_alloc(false) { - var = Engine::Get()->NewVariable(); + Chunk() : static_data(true), delay_alloc(false) {} + + /*! \brief construct a new chunk */ + Chunk(TShape shape, Context ctx_, bool delay_alloc_, int dtype) + : static_data(false), delay_alloc(true), ctx(ctx_) { + auto size = shape.Size(); + storage_shape = shape; + var = Engine::Get()->NewVariable(); + shandle.size = size * mshadow::mshadow_sizeof(dtype); + shandle.ctx = ctx_; + if (!delay_alloc_) this->CheckAndAlloc(); } - /*! 
\brief construct from static data */ + Chunk(const TBlob &data, int dev_id) - : static_data(true), - delay_alloc(false) { + : static_data(true), delay_alloc(false) { + CHECK(storage_type == kDefaultStorage); var = Engine::Get()->NewVariable(); if (data.dev_mask() == cpu::kDevMask) { - shandle.ctx = Context::CPU(); + ctx = Context::CPU(); } else { CHECK_EQ(data.dev_mask(), gpu::kDevMask); - shandle.ctx = Context::GPU(dev_id); + ctx = Context::GPU(dev_id); } + // init shandle + shandle.ctx = ctx; shandle.dptr = data.dptr_; shandle.size = data.shape_.Size() * mshadow::mshadow_sizeof(data.type_flag_); + storage_shape = data.shape_; } - /*! \brief construct a new chunk */ - Chunk(uint64_t size, Context ctx, bool delay_alloc_, int dtype) - : static_data(false), delay_alloc(true) { + // Constructor for a non-default storage chunk + Chunk(NDArrayStorageType storage_type_, const TShape &storage_shape_, Context ctx_, + bool delay_alloc_, int dtype, const std::vector &aux_types_, + const std::vector &aux_shapes_) + : static_data(false), delay_alloc(delay_alloc_), storage_type(storage_type_), + aux_types(aux_types_), ctx(ctx_), storage_shape(storage_shape_), + aux_shapes(aux_shapes_) { + shandle.ctx = ctx; var = Engine::Get()->NewVariable(); - shandle.size = size * mshadow::mshadow_sizeof(dtype); + // aux_handles always reflect the correct number of aux data + for (size_t i = 0; i < aux_shapes.size(); i++) { + CheckAndAllocAuxData(i, aux_shapes[i]); + // this line is needed in case when aux_shapes[i].Size() = 0 + // aux_handles[i] will not be updated and take only default value. 
+ aux_handles[i].ctx = ctx; + } + if (!delay_alloc) { + CheckAndAllocData(storage_shape, dtype); + } + } + + Chunk(const NDArrayStorageType storage_type_, const TBlob &data, + const std::vector &aux_data, int dev_id) + : static_data(true), delay_alloc(false), storage_type(storage_type_) { + using namespace mshadow; + CHECK_NE(storage_type, kDefaultStorage); + // init var + var = Engine::Get()->NewVariable(); + // init ctx + if (data.dev_mask() == cpu::kDevMask) { + ctx = Context::CPU(); + } else { + CHECK_EQ(data.dev_mask(), gpu::kDevMask); + ctx = Context::GPU(dev_id); + } + // init shandle shandle.ctx = ctx; - if (!delay_alloc_) this->CheckAndAlloc(); + shandle.dptr = data.dptr_; + shandle.size = data.shape_.Size() * mshadow_sizeof(data.type_flag_); + storage_shape = data.shape_; + // init aux handles + for (const auto &aux : aux_data) { + Storage::Handle aux_handle; + aux_handle.ctx = ctx; + aux_handle.dptr = aux.dptr_; + aux_handle.size = aux.shape_.Size() * mshadow_sizeof(aux.type_flag_); + aux_handles.push_back(aux_handle); + aux_types.emplace_back(aux.type_flag_); + aux_shapes.emplace_back(aux.shape_); + } + } + + /*! \brief set the shape for ith aux data, and update storage shape if necessary */ + inline void set_aux_shape(const size_t i, const TShape& shape) { + aux_shapes[i] = shape; + if (storage_shape.ndim() > 0) { + if (storage_type == kRowSparseStorage && i == rowsparse::kIdx) { + storage_shape[0] = shape[0]; + } else if (storage_type == kCSRStorage && i == csr::kIdx) { + storage_shape[0] = shape[0]; + } + } } + /*! \brief check if delay alloc is on, do alloc if not yet done */ inline void CheckAndAlloc(void) { if (delay_alloc) { @@ -411,22 +750,113 @@ class NDArray { delay_alloc = false; } } - /*! \brief destructor */ - ~Chunk() { - if (static_data || delay_alloc) { - Engine::Get()->DeleteVariable([](RunContext s) {}, shandle.ctx, var); + + /*! 
\brief Check and alloc memory for a dense ndarray */ + // size is the number of bytes + void CheckAndAlloc(uint64_t dbytes) { + CHECK_EQ(kDefaultStorage, storage_type) + << "CheckAndAlloc(dbytes) is not intended for kDefaultStorage"; + if (delay_alloc) { + shandle = Storage::Get()->Alloc(dbytes, shandle.ctx); + delay_alloc = false; + } else if (shandle.size < dbytes) { + // free storage if necessary and alloc again + if (shandle.size > 0) Storage::Get()->Free(shandle); + // init storage + shandle = Storage::Get()->Alloc(dbytes, shandle.ctx); + } + } + + inline void CheckAndAlloc(const TShape &shape, const std::vector &aux_shapes, + int dtype) { + // calculate size, perform allocation + if (kRowSparseStorage == storage_type) { + // For row sparse, aux_shape indicates the number of rows to allocate + auto aux_shape = aux_shapes[rowsparse::kIdx]; + CheckAndAllocAuxData(rowsparse::kIdx, aux_shape); + TShape storage_shape(shape); + storage_shape[0] = aux_shape[0]; + CheckAndAllocData(storage_shape, dtype); + } else if (kCSRStorage == storage_type) { + CheckAndAllocAuxData(csr::kIndPtr, aux_shapes[csr::kIndPtr]); + CheckAndAllocAuxData(csr::kIdx, aux_shapes[csr::kIdx]); + CheckAndAllocData(aux_shapes[csr::kIdx], dtype); } else { - Storage::Handle h = this->shandle; - Engine::Get()->DeleteVariable([h](RunContext s) { - Storage::Get()->Free(h); - }, shandle.ctx, var); + LOG(FATAL) << "Storage type " << storage_type << " not implemented for CheckAndAlloc"; + } + } + // create storage handle for data based on shape and dtype, assuming ctx is set + // storage shape is also updated + // if data is already allocated, try reuse the storage. 
Otherwise, free the current one + // and allocate new storage + inline void CheckAndAllocData(const TShape &shape, int dtype) { + CHECK_NE(aux_shapes.size(), 0) << "data is expected to be allocated after aux_data"; + auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); + if (shandle.size < dbytes) { + // free storage if necessary and alloc again + if (shandle.size > 0) Storage::Get()->Free(shandle); + // init storage + shandle = Storage::Get()->Alloc(dbytes, ctx); } + // init shape + storage_shape = shape; + // delay_alloc is only set when data storage handle is present + delay_alloc = false; + } + // create storage handle for aux data based on shape + // this function assumes ctx, aux shapes and aux types are set + // aux shape is also updated + // if aux data is already allocated, try reuse the storage. Otherwise, free the current one + // and allocate new storage + inline void CheckAndAllocAuxData(size_t i, const TShape &shape) { + CHECK_EQ(shape.ndim(), 1) << "shape must be 1D in CheckAndAllocAuxData"; + CHECK_NE(storage_type, kUndefinedStorage) + << "storage type cannot be kUndefinedStorage in CheckAndAllocAuxData"; + CHECK_NE(storage_type, kDefaultStorage) + << "storage type cannot be kDefaultStorage in CheckAndAllocAuxData"; + if (aux_handles.size() <= i) { + aux_handles.resize(i + 1); + } + size_t aux_bytes = shape.Size() * mshadow::mshadow_sizeof(aux_types[i]); + if (aux_handles[i].size < aux_bytes) { + // free storage if necessary and alloc again + if (aux_handles[i].size > 0) Storage::Get()->Free(aux_handles[i]); + // init aux storage + aux_handles[i] = Storage::Get()->Alloc(aux_bytes, ctx); + } + // init shape + set_aux_shape(i, shape); + } + /*! 
\brief destructor */ + ~Chunk() { + bool skip_free = static_data || delay_alloc; + Storage::Handle h = this->shandle; + std::vector aux_h = this->aux_handles; + Engine::Get()->DeleteVariable([h, aux_h, skip_free](RunContext s) { + if (skip_free == false) { + Storage::Get()->Free(h); + for (size_t i = 0; i < aux_h.size(); i++) { + if (aux_h[i].size > 0) Storage::Get()->Free(aux_h[i]); + } + } + }, shandle.ctx, var); } - }; + }; // struct Chunk void SetTBlob() const { - tblob_.dptr_ = static_cast(ptr_->shandle.dptr) + byte_offset_; - tblob_.shape_ = shape_; + CHECK(ptr_ != nullptr); + TShape shape = shape_; + char *dptr = static_cast(ptr_->shandle.dptr); + auto stype = storage_type(); + if (stype == kDefaultStorage) { + dptr += byte_offset_; + } else if (stype == kCSRStorage || stype == kRowSparseStorage) { + shape = storage_shape(); + } else { + LOG(FATAL) << "unknown storage type " << stype; + } + tblob_.dptr_ = dptr; + tblob_.shape_ = shape; tblob_.type_flag_ = dtype_; tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id); #if MKL_EXPERIMENTAL == 1 @@ -438,7 +868,7 @@ class NDArray { std::shared_ptr Mkl_mem_; #endif /*! \brief internal data of NDArray */ - std::shared_ptr ptr_; + std::shared_ptr ptr_{nullptr}; /*! \brief shape of current NDArray */ TShape shape_; /*! \brief byte offset in chunk */ @@ -455,7 +885,12 @@ class NDArray { * this situation. */ mutable TBlob tblob_; -}; +}; // class NDArray + +/*! + * \return the number of aux data used for given storage type + */ +size_t num_aux_data(NDArrayStorageType stype); /*! * \brief issue an copy operation from one NDArray to another @@ -470,7 +905,6 @@ class NDArray { */ void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0); - /*! * \brief Perform elementwise sum over each data from source, store result into out. 
* \param source the ndarray we want to sum diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h index 1bcae0d29348..f559a921c522 100644 --- a/include/mxnet/op_attr_types.h +++ b/include/mxnet/op_attr_types.h @@ -25,7 +25,6 @@ #ifndef MXNET_OP_ATTR_TYPES_H_ #define MXNET_OP_ATTR_TYPES_H_ - #include #include @@ -226,6 +225,23 @@ using FCompute = std::function& inputs, const std::vector& req, const std::vector& outputs)>; +/*! + * \brief Register an NDArray compute function for simple stateless forward only operator + * + * \note Register under "FComputeEx" and "FComputeEx" + * Dispatched only when operators process non-default storage inputs or outputs + */ +using FComputeEx = std::function& inputs, + const std::vector& req, + const std::vector& outputs)>; + +using FInferStorageType = std::function* in_attrs, + std::vector* out_attrs)>; + } // namespace mxnet #endif // MXNET_OP_ATTR_TYPES_H_ diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h index bfb42de8771a..7e3af8eeca81 100644 --- a/include/mxnet/storage.h +++ b/include/mxnet/storage.h @@ -41,11 +41,11 @@ class Storage { /*! * \brief Pointer to the data. */ - void* dptr; + void* dptr{nullptr}; /*! * \brief Size of the storage. */ - size_t size; + size_t size{0}; /*! * \brief Context information about device and ID. 
*/ diff --git a/perl-package/AI-MXNetCAPI/mxnet.i b/perl-package/AI-MXNetCAPI/mxnet.i index fd1a471bcf16..b4c1336de624 100644 --- a/perl-package/AI-MXNetCAPI/mxnet.i +++ b/perl-package/AI-MXNetCAPI/mxnet.i @@ -1203,6 +1203,12 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle, const mx_uint num_provided_arg_dtypes, const char** in, // provided_arg_dtype_names, const int* in, // provided_arg_dtypes, + +//--------------- sparse related variables, ignored for now + const mx_uint num_provided_arg_stypes, + const char** provided_arg_stype_names, + const int* provided_arg_stypes, +//--------------- const mx_uint num_shared_arg_names, const char** in, // shared_arg_name_list, //------------ diff --git a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i index 640215fd7792..5d2fbd6880a1 100644 --- a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i +++ b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i @@ -820,6 +820,17 @@ } } +%typemap(in,numinputs=0) (const mx_uint num_provided_arg_stypes, const char** provided_arg_stype_names, + const int* provided_arg_stypes) + (mx_uint temp1, char* temp2, int temp3) +{ + $2 = &temp2; + $3 = &temp3; + $1 = 0; + *$2 = NULL; + *$3 = 0; +} + %typemap(in,numinputs=0) (mx_uint* num_aux_states, NDArrayHandle** aux_states) (mx_uint temp1, diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 3c3ce76a9284..72dc2b2fec8d 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -26,6 +26,7 @@ from . import base from . import contrib from . import ndarray +from . import ndarray as nd from . import name # use mx.sym as short for symbol from . import symbol as sym @@ -34,8 +35,6 @@ from . import io from . import recordio from . import operator -# use mx.nd as short for mx.ndarray -from . import ndarray as nd # use mx.rnd as short for mx.random from . import random as rnd from . 
import random diff --git a/python/mxnet/_ctypes/ndarray.py b/python/mxnet/_ctypes/ndarray.py index 5a50f80498ec..c2e6fce40de8 100644 --- a/python/mxnet/_ctypes/ndarray.py +++ b/python/mxnet/_ctypes/ndarray.py @@ -32,10 +32,19 @@ from ..ndarray_doc import _build_doc +_STORAGE_TYPE_ID_TO_STR = { + -1 : 'undefined', + 0 : 'default', + 1 : 'row_sparse', + 2 : 'csr', +} + + class NDArrayBase(object): """Base data structure for ndarray""" __slots__ = ["handle", "writable"] # pylint: disable= no-member + def __init__(self, handle, writable=True): """initialize a new NDArray @@ -78,7 +87,11 @@ def _imperative_invoke(handle, ndargs, keys, vals, out): output_vars = ctypes.POINTER(NDArrayHandle)() num_output = ctypes.c_int(0) - check_call(_LIB.MXImperativeInvoke( + # return output stypes to avoid the c_api call for checking + # a handle's stype in _ndarray_cls + out_stypes = ctypes.POINTER(ctypes.c_int)() + + check_call(_LIB.MXImperativeInvokeEx( ctypes.c_void_p(handle), ctypes.c_int(len(ndargs)), c_array(NDArrayHandle, [arr.handle for arr in ndargs]), @@ -86,14 +99,17 @@ def _imperative_invoke(handle, ndargs, keys, vals, out): ctypes.byref(output_vars), ctypes.c_int(len(keys)), c_array(ctypes.c_char_p, [c_str(key) for key in keys]), - c_array(ctypes.c_char_p, [c_str(str(val)) for val in vals]))) + c_array(ctypes.c_char_p, [c_str(str(val)) for val in vals]), + ctypes.byref(out_stypes))) if original_output is not None: return original_output if num_output.value == 1: - return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle)) + return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle), + stype=_STORAGE_TYPE_ID_TO_STR[out_stypes[0]]) else: - return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle)) + return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle), + stype=_STORAGE_TYPE_ID_TO_STR[out_stypes[i]]) for i in range(num_output.value)] @@ -128,17 +144,24 @@ def __call__(self, *args, **kwargs): "CachedOp.__call__ got unexpected keyword argument(s): " + 
\ ', '.join(kwargs.keys())) - check_call(_LIB.MXInvokeCachedOp( + # return output stypes to avoid the c_api call for checking + # a handle's stype in _ndarray_cls + out_stypes = ctypes.POINTER(ctypes.c_int)() + + check_call(_LIB.MXInvokeCachedOpEx( self.handle, ctypes.c_int(len(args)), c_array(NDArrayHandle, [arr.handle for arr in args]), ctypes.byref(num_output), - ctypes.byref(output_vars))) + ctypes.byref(output_vars), + ctypes.byref(out_stypes))) if original_output is not None: return original_output if num_output.value == 1: - return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle)) + return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle), + stype=_STORAGE_TYPE_ID_TO_STR[out_stypes[0]]) else: - return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle)) + return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle), + stype=_STORAGE_TYPE_ID_TO_STR[out_stypes[i]]) for i in range(num_output.value)] diff --git a/python/mxnet/base.py b/python/mxnet/base.py index aad0580e7d07..d446355da0b5 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -72,6 +72,20 @@ def __str__(self): msg += ' is not implemented for Symbol and only available in NDArray.' return msg +class NotSupportedForSparseNDArray(MXNetError): + def __init__(self, function, alias, *args): + super(NotSupportedForSparseNDArray, self).__init__() + self.function = function.__name__ + self.alias = alias + self.args = [str(type(a)) for a in args] + def __str__(self): + msg = 'Function {}'.format(self.function) + if self.alias: + msg += ' (namely operator "{}")'.format(self.alias) + if self.args: + msg += ' with arguments ({})'.format(', '.join(self.args)) + msg += ' is not supported for SparseNDArray and only available in NDArray.' + return msg class MXCallbackList(ctypes.Structure): """Structure that holds Callback information. 
Passed to CustomOpProp.""" diff --git a/python/mxnet/contrib/autograd.py b/python/mxnet/contrib/autograd.py index c7fb6e17803a..2d2500e7a217 100644 --- a/python/mxnet/contrib/autograd.py +++ b/python/mxnet/contrib/autograd.py @@ -24,6 +24,7 @@ import functools from ..base import _LIB, check_call, string_types from ..base import mx_uint, NDArrayHandle, c_array +# pylint: disable= unused-import from ..ndarray import NDArray, zeros_like from ..symbol import _GRAD_REQ_MAP diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py index baff834bb33a..5cc94a5e80ac 100644 --- a/python/mxnet/executor.py +++ b/python/mxnet/executor.py @@ -27,6 +27,7 @@ from .base import mx_uint, NDArrayHandle, ExecutorHandle from .base import check_call, c_array, py_str from .ndarray import NDArray +from .ndarray import _ndarray_cls from . import ndarray as nd # those functions are not used here, we just import them to keep backward compatibility @@ -105,7 +106,9 @@ def _get_outputs(self): handles = ctypes.POINTER(NDArrayHandle)() check_call(_LIB.MXExecutorOutputs(self.handle, ctypes.byref(out_size), ctypes.byref(handles))) - return [NDArray(NDArrayHandle(handles[i])) for i in range(out_size.value)] + num_output = out_size.value + outputs = [_ndarray_cls(NDArrayHandle(handles[i])) for i in range(num_output)] + return outputs def forward(self, is_train=False, **kwargs): """Calculate the outputs specified by the bound symbol. diff --git a/python/mxnet/image/detection.py b/python/mxnet/image/detection.py index 8ac1aebe72dd..f67b05de5de3 100644 --- a/python/mxnet/image/detection.py +++ b/python/mxnet/image/detection.py @@ -27,7 +27,7 @@ from ..base import numeric_types from .. import ndarray as nd -from .._ndarray_internal import _cvcopyMakeBorder as copyMakeBorder +from ..ndarray._internal import _cvcopyMakeBorder as copyMakeBorder from .. 
import io from .image import RandomOrderAug, ColorJitterAug, LightingAug, ColorNormalizeAug from .image import ResizeAug, ForceResizeAug, CastAug, HueJitterAug, RandomGrayAug diff --git a/python/mxnet/image/image.py b/python/mxnet/image/image.py index 2e40019971ac..d99db214222c 100644 --- a/python/mxnet/image/image.py +++ b/python/mxnet/image/image.py @@ -34,9 +34,9 @@ from ..base import numeric_types from .. import ndarray as nd -from .. import _ndarray_internal as _internal -from .._ndarray_internal import _cvimresize as imresize -from .._ndarray_internal import _cvcopyMakeBorder as copyMakeBorder +from ..ndarray import _internal +from ..ndarray._internal import _cvimresize as imresize +from ..ndarray._internal import _cvcopyMakeBorder as copyMakeBorder from .. import io from .. import recordio diff --git a/python/mxnet/io.py b/python/mxnet/io.py index 0404e34ea36c..4e69a8a801cb 100644 --- a/python/mxnet/io.py +++ b/python/mxnet/io.py @@ -34,6 +34,7 @@ from .base import mx_real_t from .base import check_call, build_param_doc as _build_param_doc from .ndarray import NDArray +from .ndarray import _ndarray_cls from .ndarray import array from .ndarray import concatenate @@ -801,12 +802,12 @@ def iter_next(self): def getdata(self): hdl = NDArrayHandle() check_call(_LIB.MXDataIterGetData(self.handle, ctypes.byref(hdl))) - return NDArray(hdl, False) + return _ndarray_cls(hdl, False) def getlabel(self): hdl = NDArrayHandle() check_call(_LIB.MXDataIterGetLabel(self.handle, ctypes.byref(hdl))) - return NDArray(hdl, False) + return _ndarray_cls(hdl, False) def getindex(self): index_size = ctypes.c_uint64(0) diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index fd0091182aea..2af70e36e60a 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -22,6 +22,7 @@ import ctypes import pickle from .ndarray import NDArray +from .ndarray import _ndarray_cls from .base import _LIB from .base import check_call, c_array, c_str, string_types, mx_uint, py_str 
from .base import NDArrayHandle, KVStoreHandle @@ -53,8 +54,8 @@ def _updater_wrapper(updater): """A wrapper for the user-defined handle.""" def updater_handle(key, lhs_handle, rhs_handle, _): """ ctypes function """ - lhs = NDArray(NDArrayHandle(lhs_handle)) - rhs = NDArray(NDArrayHandle(rhs_handle)) + lhs = _ndarray_cls(NDArrayHandle(lhs_handle)) + rhs = _ndarray_cls(NDArrayHandle(rhs_handle)) updater(key, lhs, rhs) return updater_handle @@ -186,6 +187,8 @@ def pull(self, key, out=None, priority=0): The returned values are gauranteed to be the latest values in the store. + For row_sparse values, please use `row_sparse_pull` instead. + Parameters ---------- key : int or list of int @@ -236,6 +239,66 @@ def pull(self, key, out=None, priority=0): self.handle, mx_uint(len(ckeys)), ckeys, cvals, ctypes.c_int(priority))) + def row_sparse_pull(self, key, out=None, priority=0, row_ids=None): + """ Pulls a single row_sparse value or a sequence of row_sparse values from the store + with specified row_ids. + + `row_sparse_pull` is executed asynchronously after all previous + `push`/`pull`/`row_sparse_pull` calls for the same input key(s) are finished. + + The returned values are guaranteed to be the latest values in the store. + + Parameters + ---------- + key : str or list of str + Keys. + + out: NDArray or list of NDArray or list of list of NDArray + Values corresponding to the keys. The stype is expected to be row_sparse + + priority : int, optional + The priority of the pull operation. + Higher priority pull operations are likely to be executed before + other pull actions. + + row_ids : NDArray or list of NDArray + The row_ids for which to pull for each value. Each row_id is a 1D-NDArray \ + whose values don't have to be unique nor sorted. 
+ + Examples + -------- + >>> shape = (3, 3) + >>> kv.init('3', mx.nd.ones(shape).tostype('row_sparse')) + >>> a = mx.nd.zeros(shape, stype='row_sparse') + >>> row_ids = mx.nd.array([0, 2], dtype='int64') + >>> kv.row_sparse_pull('3', out=a, row_ids=row_ids) + >>> print a.asnumpy() + [[ 1. 1. 1.] + [ 0. 0. 0.] + [ 1. 1. 1.]] + >>> duplicate_row_ids = mx.nd.array([2, 2], dtype='int64') + >>> kv.row_sparse_pull('3', out=a, row_ids=duplicate_row_ids) + >>> print a.asnumpy() + [[ 0. 0. 0.] + [ 0. 0. 0.] + [ 1. 1. 1.]] + >>> unsorted_row_ids = mx.nd.array([1, 0], dtype='int64') + >>> kv.row_sparse_pull('3', out=a, row_ids=unsorted_row_ids) + >>> print a.asnumpy() + [[ 1. 1. 1.] + [ 1. 1. 1.] + [ 0. 0. 0.]] + """ + assert(out is not None) + assert(row_ids is not None) + ckeys, cvals = _ctype_key_value(key, out) + _, crow_ids = _ctype_key_value(key, row_ids) + assert(len(crow_ids) == len(cvals)), "number of row_ids doesn't match number of values" + + check_call(_LIB.MXKVStorePullRowSparse( + self.handle, mx_uint(len(ckeys)), ckeys, cvals, crow_ids, ctypes.c_int(priority))) + + def set_optimizer(self, optimizer): """ Registers an optimizer with the kvstore. 
diff --git a/python/mxnet/model.py b/python/mxnet/model.py index 01b3fa50e18f..2444ca0dc59e 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -93,8 +93,7 @@ def _create_kvstore(kvstore, num_device, arg_params): return (kv, update_on_kvstore) -def _initialize_kvstore(kvstore, param_arrays, arg_params, param_names, - update_on_kvstore): +def _initialize_kvstore(kvstore, param_arrays, arg_params, param_names, update_on_kvstore): """Initialize kvstore""" for idx, param_on_devs in enumerate(param_arrays): name = param_names[idx] @@ -118,10 +117,11 @@ def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names): def _update_params(param_arrays, grad_arrays, updater, num_device, kvstore=None, param_names=None): """Perform update of param_arrays from grad_arrays not on kvstore.""" - for index, pair in enumerate(zip(param_arrays, grad_arrays)): + for i, pair in enumerate(zip(param_arrays, grad_arrays)): arg_list, grad_list = pair if grad_list[0] is None: continue + index = i if kvstore: name = param_names[index] # push gradient, priority is negative index @@ -131,7 +131,7 @@ def _update_params(param_arrays, grad_arrays, updater, num_device, for k, p in enumerate(zip(arg_list, grad_list)): # faked an index here, to make optimizer create diff # state for the same index but on diff devs, TODO(mli) - # use a better solution latter + # use a better solution later w, g = p updater(index*num_device+k, g, w) diff --git a/python/mxnet/module/base_module.py b/python/mxnet/module/base_module.py index 3123462f9c7c..bae166e3ffd8 100644 --- a/python/mxnet/module/base_module.py +++ b/python/mxnet/module/base_module.py @@ -957,7 +957,8 @@ def bind(self, data_shapes, label_shapes=None, for_training=True, def init_optimizer(self, kvstore='local', optimizer='sgd', optimizer_params=(('learning_rate', 0.01),), force_init=False): - """Installs and initializes optimizers. 
+ """Installs and initializes optimizers, as well as initialize kvstore for + distributed training Parameters ---------- diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py index 058edd57eb3d..d55b2117ebd3 100644 --- a/python/mxnet/module/module.py +++ b/python/mxnet/module/module.py @@ -25,7 +25,6 @@ import warnings from .. import context as ctx -from .. import ndarray as nd from .. import optimizer as opt from .executor_group import DataParallelExecutorGroup @@ -33,6 +32,7 @@ from ..model import load_checkpoint from ..initializer import Uniform, InitDesc from ..io import DataDesc +from ..ndarray import zeros from .base_module import BaseModule, _check_input_names, _parse_data_desc @@ -427,13 +427,13 @@ def bind(self, data_shapes, label_shapes=None, for_training=True, else: assert self._arg_params is None and self._aux_params is None param_arrays = [ - nd.zeros(x[0].shape, dtype=x[0].dtype) + zeros(shape=x[0].shape, dtype=x[0].dtype, stype=x[0].stype) for x in self._exec_group.param_arrays ] self._arg_params = {name:arr for name, arr in zip(self._param_names, param_arrays)} aux_arrays = [ - nd.zeros(x[0].shape, dtype=x[0].dtype) + zeros(x[0].shape, dtype=x[0].dtype) for x in self._exec_group.aux_arrays ] self._aux_params = {name:arr for name, arr in zip(self._aux_names, aux_arrays)} @@ -441,7 +441,6 @@ def bind(self, data_shapes, label_shapes=None, for_training=True, if shared_module is not None and shared_module.optimizer_initialized: self.borrow_optimizer(shared_module) - def reshape(self, data_shapes, label_shapes=None): """Reshapes the module for new input shapes. 
@@ -483,6 +482,7 @@ def init_optimizer(self, kvstore='local', optimizer='sgd', if self._params_dirty: self._sync_params_from_devices() + (kvstore, update_on_kvstore) = \ _create_kvstore(kvstore, len(self._context), self._arg_params) diff --git a/python/mxnet/ndarray/__init__.py b/python/mxnet/ndarray/__init__.py new file mode 100644 index 000000000000..63220787a43c --- /dev/null +++ b/python/mxnet/ndarray/__init__.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""NDArray API of MXNet.""" + +from . 
import _internal, sparse, op +from .op import CachedOp +# pylint: disable=wildcard-import, redefined-builtin +from .ndarray import * +from .utils import load, save, zeros, empty, array +from .sparse import _ndarray_cls diff --git a/python/mxnet/_ndarray_internal.py b/python/mxnet/ndarray/_internal.py similarity index 100% rename from python/mxnet/_ndarray_internal.py rename to python/mxnet/ndarray/_internal.py diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray/ndarray.py similarity index 87% rename from python/mxnet/ndarray.py rename to python/mxnet/ndarray/ndarray.py index 42f0ff5e87cf..20ca2262f0cd 100644 --- a/python/mxnet/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -21,6 +21,7 @@ """NDArray API of MXNet.""" from __future__ import absolute_import from __future__ import division + try: from __builtin__ import slice as py_slice except ImportError: @@ -28,40 +29,25 @@ import ctypes import warnings - -import os as _os -import sys as _sys - import operator import numpy as np -from .base import _LIB, string_types, numeric_types, integer_types -from .base import c_array, py_str, c_str, mx_real_t, _Null # pylint: disable=unused-import -from .base import mx_uint, NDArrayHandle, check_call, OpHandle -from .base import ctypes2buffer -from .context import Context -from . import _ndarray_internal as _internal -from .ndarray_doc import _build_doc - - -# Use different version of SymbolBase -# When possible, use cython to speedup part of computation. 
-# pylint: disable=unused-import -try: - if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0: - from ._ctypes.ndarray import NDArrayBase, _set_ndarray_class - from ._ctypes.ndarray import CachedOp, _imperative_invoke - elif _sys.version_info >= (3, 0): - from ._cy3.ndarray import NDArrayBase, _set_ndarray_class, _imperative_invoke - from ._cy3.ndarray import CachedOp, _imperative_invoke - else: - from ._cy2.ndarray import NDArrayBase, _set_ndarray_class, _imperative_invoke - from ._cy2.ndarray import CachedOp, _imperative_invoke -except ImportError: - if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0: - raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1") - from ._ctypes.ndarray import NDArrayBase, _set_ndarray_class, _imperative_invoke - from ._ctypes.ndarray import CachedOp, _imperative_invoke -# pylint: enable=unused-import +from ..base import _LIB, numeric_types, integer_types +from ..base import c_array, mx_real_t +from ..base import mx_uint, NDArrayHandle, check_call +from ..base import ctypes2buffer +from ..context import Context +from . import _internal +from .op import NDArrayBase, _STORAGE_TYPE_ID_TO_STR +from . import broadcast_add, broadcast_mul, transpose, broadcast_not_equal, broadcast_power +from . import broadcast_sub, broadcast_div, broadcast_to, broadcast_equal, cast_storage +from . import broadcast_greater, broadcast_greater_equal, broadcast_lesser, broadcast_lesser_equal +from . 
import zeros_like, slice + +__all__ = ["NDArray", "concatenate", "_DTYPE_NP_TO_MX", "_DTYPE_MX_TO_NP", "_GRAD_REQ_MAP", + "ones", "add", "arange", "divide", "equal", "full", "greater", "greater_equal", + "imdecode", "lesser", "lesser_equal", "maximum", "minimum", "moveaxis", + "multiply", "negative", "not_equal", "onehot_encode", "power", "subtract", + "true_divide", "waitall", "_new_empty_handle"] # pylint: disable= no-member _DTYPE_NP_TO_MX = { @@ -74,7 +60,6 @@ np.int8 : 5, np.int64 : 6, } - _DTYPE_MX_TO_NP = { -1 : None, 0 : np.float32, @@ -85,7 +70,12 @@ 5 : np.int8, 6 : np.int64, } - +_STORAGE_TYPE_STR_TO_ID = { + 'undefined' : -1, + 'default' : 0, + 'row_sparse' : 1, + 'csr' : 2, +} _GRAD_REQ_MAP = { 'null': 0, 'write': 1, @@ -93,6 +83,7 @@ } # pylint: enable= no-member + def _new_empty_handle(): """Returns a new empty handle. @@ -107,6 +98,7 @@ def _new_empty_handle(): check_call(_LIB.MXNDArrayCreateNone(ctypes.byref(hdl))) return hdl + def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t): """Return a new handle with specified shape and context. @@ -128,6 +120,7 @@ def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t): ctypes.byref(hdl))) return hdl + def waitall(): """Wait for all async operations to finish in MXNet. @@ -135,6 +128,13 @@ def waitall(): """ check_call(_LIB.MXNDArrayWaitAll()) + +def _storage_type(handle): + storage_type = ctypes.c_int(0) + check_call(_LIB.MXNDArrayGetStorageType(handle, ctypes.byref(storage_type))) + return _STORAGE_TYPE_ID_TO_STR[storage_type.value] + + class NDArray(NDArrayBase): """An array object representing a multidimensional, homogeneous array of fixed-size items. 
@@ -144,6 +144,7 @@ class NDArray(NDArrayBase): # make numpy functions return NDArray instead of numpy object array __array_priority__ = 1000.0 # pylint: disable= no-member, undefined-variable + def __repr__(self): """Returns a string representation of the array.""" shape_info = 'x'.join(['%d' % x for x in self.shape]) @@ -151,6 +152,9 @@ def __repr__(self): self.__class__.__name__, shape_info, self.context) + def __reduce__(self): + return NDArray, (None,), self.__getstate__() + def __add__(self, other): """x.__add__(y) <=> x+y <=> mx.nd.add(x, y) """ return add(self, other) @@ -742,7 +746,6 @@ def wait_to_read(self): """ check_call(_LIB.MXNDArrayWaitToRead(self.handle)) - @property def ndim(self): """Returns the number of dimensions of this array @@ -777,6 +780,7 @@ def shape(self): self.handle, ctypes.byref(ndim), ctypes.byref(pdata))) return tuple(pdata[:ndim.value]) + @property def size(self): """Number of elements in the array. @@ -841,6 +845,12 @@ def dtype(self): self.handle, ctypes.byref(mx_dtype))) return _DTYPE_MX_TO_NP[mx_dtype.value] + @property + def stype(self): + """Storage-type of the array. + """ + return _storage_type(self.handle) + @property # pylint: disable= invalid-name, undefined-variable def T(self): @@ -964,7 +974,7 @@ def copyto(self, other): Returns ------- - NDArray + NDArray, CSRNDArray, RowSparseNDArray The copied array. If ``other`` is an ``NDArray``, then the return value and ``other`` will point to the same ``NDArray``. @@ -1101,6 +1111,20 @@ def backward(self, out_grad=None, retain_graph=False, train_mode=True): ctypes.c_int(retain_graph), ctypes.c_int(train_mode))) + def tostype(self, stype): + """Return a copy of the array with chosen storage type. + + See Also + ---------- + :meth:`mxnet.ndarray.cast_storage`. 
+ + Returns + ------- + NDArray, CSRNDArray or RowSparseNDArray + A copy of the array with the chosen storage type + """ + return cast_storage(self, stype=stype) + def onehot_encode(indices, out): """One-hot encoding indices into matrix out. @@ -1113,74 +1137,7 @@ def onehot_encode(indices, out): # pylint: enable= no-member, protected-access -def empty(shape, ctx=None, dtype=mx_real_t): - """Returns a new array of given shape and type, without initializing entries. - - Parameters - ---------- - shape : int or tuple of int - The shape of the empty array. - ctx : Context, optional - An optional device context (default is the current default context). - dtype : str or numpy.dtype, optional - An optional value type (default is `float32`). - - Returns - ------- - NDArray - A created array. - - Examples - -------- - >>> mx.nd.empty(1) - - >>> mx.nd.empty((1,2), mx.gpu(0)) - - >>> mx.nd.empty((1,2), mx.gpu(0), 'float16') - - """ - if isinstance(shape, integer_types): - shape = (shape, ) - if ctx is None: - ctx = Context.default_ctx - return NDArray(handle=_new_alloc_handle(shape, ctx, False, dtype)) - -def zeros(shape, ctx=None, dtype=mx_real_t, **kwargs): - """Returns a new array filled with all zeros, with the given shape and type. - - Parameters - ---------- - shape : int or tuple of int - The shape of the empty array. - ctx : Context, optional - An optional device context (default is the current default context). - dtype : str or numpy.dtype, optional - An optional value type (default is `float32`). - out : NDArray, optional - The output NDArray (default is `None`). 
- - Returns - ------- - NDArray - A created array - - Examples - -------- - >>> mx.nd.zeros(1).asnumpy() - array([ 0.], dtype=float32) - >>> mx.nd.zeros((1,2), mx.gpu(0)) - - >>> mx.nd.zeros((1,2), mx.gpu(0), 'float16').asnumpy() - array([[ 0., 0.]], dtype=float16) - """ - # pylint: disable= unused-argument - if ctx is None: - ctx = Context.default_ctx - # pylint: disable= no-member, protected-access - return _internal._zeros(shape=shape, ctx=ctx, dtype=dtype, **kwargs) - # pylint: enable= no-member, protected-access - -def ones(shape, ctx=None, dtype=mx_real_t, **kwargs): +def ones(shape, ctx=None, dtype=None, **kwargs): """Returns a new array filled with all ones, with the given shape and type. Parameters @@ -1212,10 +1169,12 @@ def ones(shape, ctx=None, dtype=mx_real_t, **kwargs): # pylint: disable= unused-argument if ctx is None: ctx = Context.default_ctx + dtype = mx_real_t if dtype is None else dtype # pylint: disable= no-member, protected-access return _internal._ones(shape=shape, ctx=ctx, dtype=dtype, **kwargs) # pylint: enable= no-member, protected-access + def full(shape, val, ctx=None, dtype=mx_real_t, out=None): """Returns a new array of given shape and type, filled with the given value `val`. @@ -1269,18 +1228,6 @@ def array(source_array, ctx=None, dtype=None): ------- NDArray An `NDArray` with the same contents as the `source_array`. 
- - Examples - -------- - >>> import numpy as np - >>> mx.nd.array([1, 2, 3]) - - >>> mx.nd.array([[1, 2], [3, 4]]) - - >>> mx.nd.array(np.zeros((3, 2))) - - >>> mx.nd.array(np.zeros((3, 2)), mx.gpu(0)) - """ if isinstance(source_array, NDArray): dtype = source_array.dtype if dtype is None else dtype @@ -1382,6 +1329,7 @@ def arange(start, stop=None, step=1.0, repeat=1, ctx=None, dtype=mx_real_t): dtype=dtype, ctx=str(ctx)) # pylint: enable= no-member, protected-access, too-many-arguments + #pylint: disable= too-many-arguments, no-member, protected-access def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None): """ Helper function for element-wise operation. @@ -1430,6 +1378,7 @@ def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None): raise TypeError('type %s not supported' % str(type(rhs))) #pylint: enable= too-many-arguments, no-member, protected-access + def add(lhs, rhs): """Returns element-wise sum of the input arrays with broadcasting. @@ -1491,6 +1440,7 @@ def add(lhs, rhs): None) # pylint: enable= no-member, protected-access + def subtract(lhs, rhs): """Returns element-wise difference of the input arrays with broadcasting. @@ -1552,6 +1502,7 @@ def subtract(lhs, rhs): _internal._rminus_scalar) # pylint: enable= no-member, protected-access + def multiply(lhs, rhs): """Returns element-wise product of the input arrays with broadcasting. @@ -1612,6 +1563,7 @@ def multiply(lhs, rhs): None) # pylint: enable= no-member, protected-access + def divide(lhs, rhs): """Returns element-wise division of the input arrays with broadcasting. @@ -1668,6 +1620,7 @@ def divide(lhs, rhs): _internal._rdiv_scalar) # pylint: enable= no-member, protected-access + def modulo(lhs, rhs): """Returns element-wise modulo of the input arrays with broadcasting. 
@@ -1724,6 +1677,7 @@ def modulo(lhs, rhs): _internal._rmod_scalar) # pylint: enable= no-member, protected-access + def power(base, exp): """Returns result of first array elements raised to powers from second array, element-wise with broadcasting. @@ -1785,6 +1739,7 @@ def power(base, exp): _internal._rpower_scalar) # pylint: enable= no-member, protected-access + def maximum(lhs, rhs): """Returns element-wise maximum of the input arrays with broadcasting. @@ -1841,6 +1796,7 @@ def maximum(lhs, rhs): None) # pylint: enable= no-member, protected-access + def minimum(lhs, rhs): """Returns element-wise minimum of the input arrays with broadcasting. @@ -1897,6 +1853,7 @@ def minimum(lhs, rhs): None) # pylint: enable= no-member, protected-access + def equal(lhs, rhs): """Returns the result of element-wise **equal to** (==) comparison operation with broadcasting. @@ -1960,6 +1917,7 @@ def equal(lhs, rhs): None) # pylint: enable= no-member, protected-access + def not_equal(lhs, rhs): """Returns the result of element-wise **not equal to** (!=) comparison operation with broadcasting. @@ -2026,6 +1984,7 @@ def not_equal(lhs, rhs): None) # pylint: enable= no-member, protected-access + def greater(lhs, rhs): """Returns the result of element-wise **greater than** (>) comparison operation with broadcasting. @@ -2089,6 +2048,7 @@ def greater(lhs, rhs): _internal._lesser_scalar) # pylint: enable= no-member, protected-access + def greater_equal(lhs, rhs): """Returns the result of element-wise **greater than or equal to** (>=) comparison operation with broadcasting. @@ -2152,6 +2112,7 @@ def greater_equal(lhs, rhs): _internal._lesser_equal_scalar) # pylint: enable= no-member, protected-access + def lesser(lhs, rhs): """Returns the result of element-wise **lesser than** (<) comparison operation with broadcasting. 
@@ -2279,12 +2240,14 @@ def lesser_equal(lhs, rhs): _internal._greater_equal_scalar) # pylint: enable= no-member, protected-access + def true_divide(lhs, rhs): """This function is similar to :meth:`divide`. """ return divide(lhs, rhs) + def negative(arr): """Numerical negative, element-wise. @@ -2310,95 +2273,6 @@ def negative(arr): return multiply(arr, -1.0) -def load(fname): - """Loads an array from file. - - See more details in ``save``. - - Parameters - ---------- - fname : str - The filename. - - Returns - ------- - list of NDArray or dict of str to NDArray - Loaded data. - """ - if not isinstance(fname, string_types): - raise TypeError('fname required to be a string') - out_size = mx_uint() - out_name_size = mx_uint() - handles = ctypes.POINTER(NDArrayHandle)() - names = ctypes.POINTER(ctypes.c_char_p)() - check_call(_LIB.MXNDArrayLoad(c_str(fname), - ctypes.byref(out_size), - ctypes.byref(handles), - ctypes.byref(out_name_size), - ctypes.byref(names))) - if out_name_size.value == 0: - return [NDArray(NDArrayHandle(handles[i])) for i in range(out_size.value)] - else: - assert out_name_size.value == out_size.value - return dict( - (py_str(names[i]), NDArray(NDArrayHandle(handles[i]))) for i in range(out_size.value)) - - -def save(fname, data): - """Saves a list of arrays or a dict of str->array to file. - - Examples of filenames: - - - ``/path/to/file`` - - ``s3://my-bucket/path/to/file`` (if compiled with AWS S3 supports) - - ``hdfs://path/to/file`` (if compiled with HDFS supports) - - Parameters - ---------- - fname : str - The filename. - data : ``NDArray``, list of ``NDArray` or dict of str to ``NDArray`` - The data to save. 
- - Examples - -------- - >>> x = mx.nd.zeros((2,3)) - >>> y = mx.nd.ones((1,4)) - >>> mx.nd.save('my_list', [x,y]) - >>> mx.nd.save('my_dict', {'x':x, 'y':y}) - >>> mx.nd.load('my_list') - [, ] - >>> mx.nd.load('my_dict') - {'y': , 'x': } - """ - if isinstance(data, NDArray): - data = [data] - handles = [] - if isinstance(data, dict): - keys = [] - for key, val in data.items(): - if not isinstance(key, string_types): - raise TypeError('save only accept dict str->NDArray or list of NDArray') - if not isinstance(val, NDArray): - raise TypeError('save only accept dict str->NDArray or list of NDArray') - keys.append(c_str(key)) - handles.append(val.handle) - keys = c_array(ctypes.c_char_p, keys) - elif isinstance(data, list): - for val in data: - if not isinstance(val, NDArray): - raise TypeError('save only accept dict str->NDArray or list of NDArray') - handles.append(val.handle) - keys = None - else: - raise ValueError("data needs to either be a NDArray, dict of str, NDArray pairs " - "or a list of NDarrays.") - check_call(_LIB.MXNDArraySave(c_str(fname), - mx_uint(len(handles)), - c_array(NDArrayHandle, handles), - keys)) - - def concatenate(arrays, axis=0, always_copy=True): """DEPRECATED, use ``concat`` instead @@ -2455,6 +2329,7 @@ def concatenate(arrays, axis=0, always_copy=True): return ret + def imdecode(str_img, clip_rect=(0, 0, 0, 0), out=None, index=0, channels=3, mean=None): """DEPRECATED, use mx.img instead @@ -2497,159 +2372,65 @@ def imdecode(str_img, clip_rect=(0, 0, 0, 0), out=None, index=0, channels=3, mea out=out) -# pylint: disable=too-many-locals, invalid-name -def _make_ndarray_function(handle, name): - """Create a NDArray function from the FunctionHandle.""" - real_name = ctypes.c_char_p() - desc = ctypes.c_char_p() - num_args = mx_uint() - arg_names = ctypes.POINTER(ctypes.c_char_p)() - arg_types = ctypes.POINTER(ctypes.c_char_p)() - arg_descs = ctypes.POINTER(ctypes.c_char_p)() - key_var_num_args = ctypes.c_char_p() - ret_type = 
ctypes.c_char_p() - - check_call(_LIB.MXSymbolGetAtomicSymbolInfo( - handle, ctypes.byref(real_name), ctypes.byref(desc), - ctypes.byref(num_args), - ctypes.byref(arg_names), - ctypes.byref(arg_types), - ctypes.byref(arg_descs), - ctypes.byref(key_var_num_args), - ctypes.byref(ret_type))) - narg = int(num_args.value) - arg_names = [py_str(arg_names[i]) for i in range(narg)] - arg_types = [py_str(arg_types[i]) for i in range(narg)] - func_name = name - key_var_num_args = py_str(key_var_num_args.value) - ret_type = py_str(ret_type.value) if ret_type.value is not None else '' - doc_str = _build_doc(func_name, - py_str(desc.value), - arg_names, - arg_types, - [py_str(arg_descs[i]) for i in range(narg)], - key_var_num_args, - ret_type) - - dtype_name = None - arr_name = None - ndsignature = [] - signature = [] - ndarg_names = [] - kwarg_names = [] - for i in range(narg): - name, atype = arg_names[i], arg_types[i] - if name == 'dtype': - dtype_name = name - signature.append('%s=_Null'%name) - elif atype.startswith('NDArray') or atype.startswith('Symbol'): - assert not arr_name, \ - "Op can only have one argument with variable " \ - "size and it must be the last argument." 
- if atype.endswith('[]'): - ndsignature.append('*%s'%name) - arr_name = name - else: - ndsignature.append('%s=None'%name) - ndarg_names.append(name) - else: - signature.append('%s=_Null'%name) - kwarg_names.append(name) - signature.append('out=None') - signature.append('name=None') - signature.append('**kwargs') - signature = ndsignature + signature - - code = [] - if arr_name: - code.append(""" -def %s(*%s, **kwargs):"""%(func_name, arr_name)) - code.append(""" - ndargs = [] - for i in {}: - assert isinstance(i, NDArrayBase), \\ - "Positional arguments must have NDArray type, " \\ - "but got %s"%str(i) - ndargs.append(i)""".format(arr_name)) - if dtype_name is not None: - code.append(""" - if '%s' in kwargs: - kwargs['%s'] = np.dtype(kwargs['%s']).name"""%( - dtype_name, dtype_name, dtype_name)) - code.append(""" - _ = kwargs.pop('name', None) - out = kwargs.pop('out', None) - keys = list(kwargs.keys()) - vals = list(kwargs.values())""") - else: - code.append(""" -def %s(%s): - ndargs = [] - keys = list(kwargs.keys()) - vals = list(kwargs.values())"""%(func_name, ', '.join(signature))) - # NDArray args - for name in ndarg_names: # pylint: disable=redefined-argument-from-local - code.append(""" - if {name} is not None: - assert isinstance({name}, NDArrayBase), \\ - "Argument {name} must have NDArray type, but got %s"%str({name}) - ndargs.append({name})""".format(name=name)) - # kwargs - for name in kwarg_names: # pylint: disable=redefined-argument-from-local - code.append(""" - if %s is not _Null: - keys.append('%s') - vals.append(%s)"""%(name, name, name)) - # dtype - if dtype_name is not None: - code.append(""" - if %s is not _Null: - keys.append('%s') - vals.append(np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) - - code.append(""" - return _imperative_invoke(%d, ndargs, keys, vals, out)"""%( - handle.value)) - - local = {} - exec(''.join(code), None, local) # pylint: disable=exec-used - ndarray_function = local[func_name] - 
ndarray_function.__name__ = func_name - ndarray_function.__doc__ = doc_str - ndarray_function.__module__ = 'mxnet.ndarray' - return ndarray_function - - -# pylint: enable=too-many-locals, invalid-name -def _init_ndarray_module(ndarray_class, root_namespace): - """List and add all the ndarray functions to current module.""" - _set_ndarray_class(ndarray_class) - plist = ctypes.POINTER(ctypes.c_char_p)() - size = ctypes.c_uint() - - check_call(_LIB.MXListAllOpNames(ctypes.byref(size), - ctypes.byref(plist))) - op_names = [] - for i in range(size.value): - op_names.append(py_str(plist[i])) - - module_obj = _sys.modules["%s.ndarray" % root_namespace] - module_internal = _sys.modules["%s._ndarray_internal" % root_namespace] - module_contrib = _sys.modules["%s.contrib.ndarray" % root_namespace] - for name in op_names: - hdl = OpHandle() - check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl))) - function = _make_ndarray_function(hdl, name) - if function.__name__.startswith('_contrib_'): - function.__name__ = function.__name__[9:] - function.__module__ = 'mxnet.contrib.ndarray' - setattr(module_contrib, function.__name__, function) - elif function.__name__.startswith('_'): - setattr(module_internal, function.__name__, function) - else: - setattr(module_obj, function.__name__, function) +def zeros(shape, ctx=None, dtype=None, **kwargs): + """Returns a new array filled with all zeros, with the given shape and type. + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array. + ctx : Context, optional + An optional device context (default is the current default context). + dtype : str or numpy.dtype, optional + An optional value type (default is `float32`). + out : NDArray, optional + The output NDArray (default is `None`). 
+ + Returns + ------- + NDArray + A created array + + Examples + -------- + >>> mx.nd.zeros(1).asnumpy() + array([ 0.], dtype=float32) + >>> mx.nd.zeros((1,2), mx.gpu(0)) + + >>> mx.nd.zeros((1,2), mx.gpu(0), 'float16').asnumpy() + array([[ 0., 0.]], dtype=float16) + """ + # pylint: disable= unused-argument + if ctx is None: + ctx = Context.default_ctx + dtype = mx_real_t if dtype is None else dtype + # pylint: disable= no-member, protected-access + return _internal._zeros(shape=shape, ctx=ctx, dtype=dtype, **kwargs) + # pylint: enable= no-member, protected-access + + +def empty(shape, ctx=None, dtype=None): + """Returns a new array of given shape and type, without initializing entries. + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array. + ctx : Context, optional + An optional device context (default is the current default context). + dtype : str or numpy.dtype, optional + An optional value type (default is `float32`). -_init_ndarray_module(NDArray, "mxnet") + Returns + ------- + NDArray + A created array. -# from .base import add_fileline_to_docstring -# add_fileline_to_docstring(__name__) + """ + if isinstance(shape, int): + shape = (shape, ) + if ctx is None: + ctx = Context.default_ctx + if dtype is None: + dtype = mx_real_t + return NDArray(handle=_new_alloc_handle(shape, ctx, False, dtype)) diff --git a/python/mxnet/ndarray/op.py b/python/mxnet/ndarray/op.py new file mode 100644 index 000000000000..e4a1ab0df48b --- /dev/null +++ b/python/mxnet/ndarray/op.py @@ -0,0 +1,209 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Register backend ops in mxnet.ndarray namespace""" + +import sys as _sys +import os as _os +import ctypes +import numpy as np # pylint: disable=unused-import + +from ..ndarray_doc import _build_doc + +# Use different version of SymbolBase +# When possible, use cython to speedup part of computation. +# pylint: disable=unused-import +try: + if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0: + from .._ctypes.ndarray import NDArrayBase, _STORAGE_TYPE_ID_TO_STR + from .._ctypes.ndarray import CachedOp, _imperative_invoke + elif _sys.version_info >= (3, 0): + from .._cy3.ndarray import NDArrayBase, _imperative_invoke, _STORAGE_TYPE_ID_TO_STR + from .._cy3.ndarray import CachedOp, _imperative_invoke + else: + from .._cy2.ndarray import NDArrayBase, _imperative_invoke, _STORAGE_TYPE_ID_TO_STR + from .._cy2.ndarray import CachedOp, _imperative_invoke +except ImportError: + if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0: + raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1") + from .._ctypes.ndarray import NDArrayBase, _imperative_invoke, _STORAGE_TYPE_ID_TO_STR + from .._ctypes.ndarray import CachedOp, _imperative_invoke + +from ..base import mx_uint, check_call, _LIB, py_str, OpHandle, c_str, _Null +# pylint: enable=unused-import + + +# pylint: disable=too-many-locals, invalid-name +def _make_ndarray_function(handle, name): + """Create a NDArray function from the FunctionHandle.""" + real_name = ctypes.c_char_p() + desc = ctypes.c_char_p() + num_args = mx_uint() + arg_names = ctypes.POINTER(ctypes.c_char_p)() + 
arg_types = ctypes.POINTER(ctypes.c_char_p)() + arg_descs = ctypes.POINTER(ctypes.c_char_p)() + key_var_num_args = ctypes.c_char_p() + ret_type = ctypes.c_char_p() + + check_call(_LIB.MXSymbolGetAtomicSymbolInfo( + handle, ctypes.byref(real_name), ctypes.byref(desc), + ctypes.byref(num_args), + ctypes.byref(arg_names), + ctypes.byref(arg_types), + ctypes.byref(arg_descs), + ctypes.byref(key_var_num_args), + ctypes.byref(ret_type))) + narg = int(num_args.value) + arg_names = [py_str(arg_names[i]) for i in range(narg)] + arg_types = [py_str(arg_types[i]) for i in range(narg)] + func_name = name + key_var_num_args = py_str(key_var_num_args.value) + ret_type = py_str(ret_type.value) if ret_type.value is not None else '' + doc_str = _build_doc(func_name, + py_str(desc.value), + arg_names, + arg_types, + [py_str(arg_descs[i]) for i in range(narg)], + key_var_num_args, + ret_type) + + dtype_name = None + arr_name = None + ndsignature = [] + signature = [] + ndarg_names = [] + kwarg_names = [] + for i in range(narg): + name, atype = arg_names[i], arg_types[i] + if name == 'dtype': + dtype_name = name + signature.append('%s=_Null'%name) + elif atype.startswith('NDArray') or atype.startswith('Symbol'): + assert not arr_name, \ + "Op can only have one argument with variable " \ + "size and it must be the last argument." 
+ if atype.endswith('[]'): + ndsignature.append('*%s'%name) + arr_name = name + else: + ndsignature.append('%s=None'%name) + ndarg_names.append(name) + else: + signature.append('%s=_Null'%name) + kwarg_names.append(name) + signature.append('out=None') + signature.append('name=None') + signature.append('**kwargs') + signature = ndsignature + signature + + code = [] + if arr_name: + code.append(""" +def %s(*%s, **kwargs):"""%(func_name, arr_name)) + code.append(""" + ndargs = [] + for i in {}: + assert isinstance(i, NDArrayBase), \\ + "Positional arguments must have NDArray type, " \\ + "but got %s"%str(i) + ndargs.append(i)""".format(arr_name)) + if dtype_name is not None: + code.append(""" + if '%s' in kwargs: + kwargs['%s'] = np.dtype(kwargs['%s']).name"""%( + dtype_name, dtype_name, dtype_name)) + code.append(""" + _ = kwargs.pop('name', None) + out = kwargs.pop('out', None) + keys = list(kwargs.keys()) + vals = list(kwargs.values())""") + else: + code.append(""" +def %s(%s): + ndargs = [] + keys = list(kwargs.keys()) + vals = list(kwargs.values())"""%(func_name, ', '.join(signature))) + # NDArray args + for name in ndarg_names: # pylint: disable=redefined-argument-from-local + code.append(""" + if {name} is not None: + assert isinstance({name}, NDArrayBase), \\ + "Argument {name} must have NDArray type, but got %s"%str({name}) + ndargs.append({name})""".format(name=name)) + # kwargs + for name in kwarg_names: # pylint: disable=redefined-argument-from-local + code.append(""" + if %s is not _Null: + keys.append('%s') + vals.append(%s)"""%(name, name, name)) + # dtype + if dtype_name is not None: + code.append(""" + if %s is not _Null: + keys.append('%s') + vals.append(np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) + + code.append(""" + return _imperative_invoke(%d, ndargs, keys, vals, out)"""%( + handle.value)) + + local = {} + exec(''.join(code), None, local) # pylint: disable=exec-used + ndarray_function = local[func_name] + 
ndarray_function.__name__ = func_name + ndarray_function.__doc__ = doc_str + ndarray_function.__module__ = 'mxnet.ndarray' + return ndarray_function + + +# pylint: enable=too-many-locals, invalid-name +def _init_ndarray_module(root_namespace): + """List and add all the ndarray functions to current module.""" + plist = ctypes.POINTER(ctypes.c_char_p)() + size = ctypes.c_uint() + + check_call(_LIB.MXListAllOpNames(ctypes.byref(size), + ctypes.byref(plist))) + op_names = [] + for i in range(size.value): + op_names.append(py_str(plist[i])) + + module_obj = _sys.modules["%s.ndarray" % root_namespace] + module_sparse = _sys.modules["%s.ndarray.sparse" % root_namespace] + module_internal = _sys.modules["%s.ndarray._internal" % root_namespace] + module_contrib = _sys.modules["%s.contrib.ndarray" % root_namespace] + for name in op_names: + hdl = OpHandle() + check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl))) + function = _make_ndarray_function(hdl, name) + if function.__name__.startswith('_contrib_'): + function.__name__ = function.__name__[9:] + function.__module__ = 'mxnet.contrib.ndarray' + setattr(module_contrib, function.__name__, function) + elif function.__name__.startswith('_'): + setattr(module_internal, function.__name__, function) + else: + setattr(module_obj, function.__name__, function) + + # register sparse ops under mxnet.ndarray.sparse + if function.__name__.startswith('_sparse_'): + function.__name__ = function.__name__[8:] + function.__module__ = 'mxnet.ndarray.sparse' + setattr(module_sparse, function.__name__, function) + +# register backend operators in mx.nd +_init_ndarray_module("mxnet") diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py new file mode 100644 index 000000000000..97e43f5ebe79 --- /dev/null +++ b/python/mxnet/ndarray/sparse.py @@ -0,0 +1,923 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +"""Sparse NDArray API of MXNet.""" + +from __future__ import absolute_import +from __future__ import division +try: + from __builtin__ import slice as py_slice +except ImportError: + from builtins import slice as py_slice + +import ctypes +import warnings + +import os as _os +import sys as _sys + +# import operator +import numpy as np +from ..base import NotSupportedForSparseNDArray +from ..base import _LIB, numeric_types +from ..base import c_array, mx_real_t +from ..base import mx_uint, NDArrayHandle, check_call +from ..context import Context +from . import _internal +from .ndarray import _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP +from .ndarray import _STORAGE_TYPE_STR_TO_ID +from .ndarray import NDArray, _storage_type +from .ndarray import zeros as _zeros_ndarray +from .ndarray import array as _array +from . import cast_storage +from . import slice as nd_slice + +# Use different verison of SymbolBase +# When possible, use cython to speedup part of computation. 
+# pylint: disable=unused-import +try: + if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0: + from .._ctypes.ndarray import _set_ndarray_class + elif _sys.version_info >= (3, 0): + from .._cy3.ndarray import _set_ndarray_class + else: + from .._cy2.ndarray import _set_ndarray_class +except ImportError: + if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0: + raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1") + from .._ctypes.ndarray import _set_ndarray_class +# pylint: enable=unused-import + + +__all__ = ["_ndarray_cls", "csr_matrix", "row_sparse_array", + "BaseSparseNDArray", "CSRNDArray", "RowSparseNDArray"] + + +_STORAGE_AUX_TYPES = { + 'row_sparse': [np.int64], + 'csr': [np.int64, np.int64] +} + + +def _new_alloc_handle(stype, shape, ctx, delay_alloc, dtype, aux_types, aux_shapes=None): + """Return a new handle with specified storage type, shape, dtype and context. + + Empty handle is only used to hold results + + Returns + ------- + handle + A new empty ndarray handle + """ + hdl = NDArrayHandle() + aux_type_ids = [int(_DTYPE_NP_TO_MX[np.dtype(aux_t).type]) for aux_t in aux_types] + aux_shapes = [(0,) for aux_t in aux_types] if aux_shapes is None else aux_shapes + aux_shape_lens = [len(aux_shape) for aux_shape in aux_shapes] + aux_shapes = sum(aux_shapes, ()) + num_aux = mx_uint(len(aux_types)) + check_call(_LIB.MXNDArrayCreateSparseEx( + ctypes.c_int(int(_STORAGE_TYPE_STR_TO_ID[stype])), + c_array(mx_uint, shape), + mx_uint(len(shape)), + ctypes.c_int(ctx.device_typeid), + ctypes.c_int(ctx.device_id), + ctypes.c_int(int(delay_alloc)), + ctypes.c_int(int(_DTYPE_NP_TO_MX[np.dtype(dtype).type])), + num_aux, + c_array(ctypes.c_int, aux_type_ids), + c_array(mx_uint, aux_shape_lens), + c_array(mx_uint, aux_shapes), + ctypes.byref(hdl))) + return hdl + + +class BaseSparseNDArray(NDArray): + """The base class of an NDArray stored in a sparse storage format. + + See CSRNDArray and RowSparseNDArray for more details. 
+ """ + + def __iadd__(self, other): + raise NotImplementedError() + + def __isub__(self, other): + raise NotImplementedError() + + def __imul__(self, other): + raise NotImplementedError() + + def __idiv__(self, other): + raise NotImplementedError() + + def __itruediv__(self, other): + raise NotImplementedError() + + def _sync_copyfrom(self, source_array): + raise NotImplementedError() + + def _at(self, idx): + raise NotSupportedForSparseNDArray(self._at, '[idx]', idx) + + def _slice(self, start, stop): + raise NotSupportedForSparseNDArray(self._slice, None, start, stop) + + def reshape(self, shape): + raise NotSupportedForSparseNDArray(self.reshape, None, shape) + + def _aux_type(self, i): + """Data-type of the array's ith aux data. + + Returns + ------- + numpy.dtype + This BaseSparseNDArray's aux data type. + """ + aux_type = ctypes.c_int() + check_call(_LIB.MXNDArrayGetAuxType(self.handle, i, ctypes.byref(aux_type))) + return _DTYPE_MX_TO_NP[aux_type.value] + + @property + def _num_aux(self): + """The number of aux data used to help store the sparse ndarray. + """ + return len(_STORAGE_AUX_TYPES[self.stype]) + + @property + def _aux_types(self): + """The data types of the aux data for the BaseSparseNDArray. + """ + aux_types = [] + num_aux = self._num_aux + for i in range(num_aux): + aux_types.append(self._aux_type(i)) + return aux_types + + def asnumpy(self): + """Return a dense ``numpy.ndarray`` object with value copied from this array + """ + return self.tostype('default').asnumpy() + + def astype(self, dtype): + """Returns a copy of the array after casting to a specified type. + Parameters + ---------- + dtype : numpy.dtype or str + The type of the returned array. 
+ Examples + -------- + >>> x = mx.nd.zeros('row_sparse', (2,3), dtype='float32') + >>> y = x.astype('int32') + >>> y.dtype + + """ + res = zeros(shape=self.shape, ctx=self.context, + dtype=dtype, stype=self.stype) + self.copyto(res) + return res + + def copyto(self, other): + """Copies the value of this array to another array. + + Parameters + ---------- + other : NDArray or CSRNDArray or RowSparseNDArray or Context + The destination array or context. + + Returns + ------- + NDArray or CSRNDArray or RowSparseNDArray + The copied array. + """ + if isinstance(other, NDArray): + if other.handle is self.handle: + warnings.warn('You are attempting to copy an array to itself', RuntimeWarning) + return + return _internal._copyto(self, out=other) + elif isinstance(other, Context): + hret = _ndarray_cls(_new_alloc_handle(self.stype, self.shape, other, + True, self.dtype, self._aux_types)) + return _internal._copyto(self, out=hret) + else: + raise TypeError('copyto does not support type ' + str(type(other))) + + def _data(self): + """A deep copy NDArray of the data array associated with the BaseSparseNDArray. + + This function blocks. Do not use it in performance critical code. + """ + self.wait_to_read() + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayGetDataNDArray(self.handle, ctypes.byref(hdl))) + return NDArray(hdl) + + + def _aux_data(self, i): + """ Get a deep copy NDArray of the i-th aux data array associated with the + BaseSparseNDArray. + + This function blocks. Do not use it in performance critical code. + """ + self.wait_to_read() + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayGetAuxNDArray(self.handle, i, ctypes.byref(hdl))) + return NDArray(hdl) + + +# pylint: disable=abstract-method +class CSRNDArray(BaseSparseNDArray): + """A sparse representation of 2D NDArray in the standard CSR format. + + A CSRNDArray represents an NDArray as three separate arrays: `data`, + `indptr` and `indices`. 
It uses the standard CSR representation where the column indices for + row i are stored in indices[indptr[i]:indptr[i+1]] and their corresponding values are stored + in values[indptr[i]:indptr[i+1]]. + + The column indices for a given row are expected to be sorted in ascending order. + Duplicate column entries for the same row are not allowed. + + Example + ------- + >>> a = mx.nd.array([[0, 1, 0], [2, 0, 0], [0, 0, 0], [0, 0, 3]]) + >>> a = a.tostype('csr') + >>> a.indices.asnumpy() + array([1, 0, 2]) + >>> a.indptr.asnumpy() + array([0, 1, 2, 2, 3]) + >>> a.data.asnumpy() + array([ 1., 2., 3.], dtype=float32) + """ + + def __reduce__(self): + return CSRNDArray, (None,), super(CSRNDArray, self).__getstate__() + + def __iadd__(self, other): + (self + other).copyto(self) + return self + + def __isub__(self, other): + (self - other).copyto(self) + return self + + def __imul__(self, other): + (self * other).copyto(self) + return self + + def __idiv__(self, other): + (self / other).copyto(self) + return self + + def __itruediv__(self, other): + (self / other).copyto(self) + return self + + def __getitem__(self, key): + """x.__getitem__(i) <=> x[i] + + Returns a sliced view of this array. + + Parameters + ---------- + key : slice + Indexing key. 
+ + Examples + -------- + >>> indptr = np.array([0, 2, 3, 6]) + >>> indices = np.array([0, 2, 2, 0, 1, 2]) + >>> data = np.array([1, 2, 3, 4, 5, 6]) + >>> a = mx.nd.csr_matrix(data, indptr, indices, (3, 3)) + >>> a.asnumpy() + array([[1, 0, 2], + [0, 0, 3], + [4, 5, 6]]) + >>> a[1:2].asnumpy() + array([[0, 0, 3]], dtype=float32) + """ + if isinstance(key, int): + raise ValueError("__getitem__ with int key is not implemented for CSRNDArray") + if isinstance(key, py_slice): + if key.step is not None: + raise ValueError('CSRNDArray only supports continuous slicing on axis 0') + if key.start is not None or key.stop is not None: + begin = key.start if key.start else 0 + end = key.stop if key.stop else self.shape[0] + return nd_slice(self, begin=begin, end=end) + else: + return self + if isinstance(key, tuple): + raise ValueError('Multi-dimension indexing is not supported') + + def __setitem__(self, key, value): + """x.__setitem__(i, y) <=> x[i]=y + + Set self[key] to value. Only slice key [:] is supported. + + Parameters + ---------- + key : slice + The indexing key. + value : NDArray or CSRNDArray or numpy.ndarray + The value to set. 
+ + Examples + -------- + >>> src = mx.nd.zeros((3,3), stype='csr') + >>> src.asnumpy() + array([[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]], dtype=float32) + >>> # assign CSRNDArray with same storage type + >>> x = mx.nd.ones('row_sparse', (3,3)).tostype('csr') + >>> x[:] = src + >>> x.asnumpy() + array([[ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.]], dtype=float32) + >>> # assign NDArray to CSRNDArray + >>> x[:] = mx.nd.ones((3,3)) * 2 + >>> x.asnumpy() + array([[ 2., 2., 2.], + [ 2., 2., 2.], + [ 2., 2., 2.]], dtype=float32) + """ + if not self.writable: + raise ValueError('Failed to assign to a readonly CSRNDArray') + if isinstance(key, py_slice): + if key.step is not None or key.start is not None or key.stop is not None: + raise ValueError('Assignment with slice for CSRNDArray is not ' \ + 'implmented yet.') + if isinstance(value, NDArray): + # avoid copying to itself + if value.handle is not self.handle: + value.copyto(self) + elif isinstance(value, numeric_types): + raise ValueError("Assigning numeric types to CSRNDArray is " \ + "not implemented yet.") + elif isinstance(value, (np.ndarray, np.generic)): + # TODO(haibin/anisub) check scipy.sparse and use _sync_copy_from to + # avoid the temporary copy + warnings.warn('Assigning non-NDArray object to CSRNDArray is not efficient', + RuntimeWarning) + tmp = _array(value) + tmp.copyto(self) + else: + raise TypeError('type %s not supported' % str(type(value))) + else: + assert(isinstance(key, (int, tuple))) + raise Exception('CSRNDArray only supports [:] for assignment') + + @property + def indices(self): + """A deep copy NDArray of the indices array of the CSRNDArray. + This generates a deep copy of the column indices of the current `csr` matrix. + + Returns + ------- + NDArray + This CSRNDArray's indices array. + """ + return self._aux_data(1) + + @property + def indptr(self): + """A deep copy NDArray of the indptr array of the CSRNDArray. 
+ This generates a deep copy of the `indptr` of the current `csr` matrix. + + Returns + ------- + NDArray + This CSRNDArray's indptr array. + """ + return self._aux_data(0) + + @property + def data(self): + """A deep copy NDArray of the data array of the CSRNDArray. + This generates a deep copy of the `data` of the current `csr` matrix. + + Returns + ------- + NDArray + This CSRNDArray's data array. + """ + return self._data() + + def tostype(self, stype): + """Return a copy of the array with chosen storage type. + + Returns + ------- + NDArray or CSRNDArray + A copy of the array with the chosen storage stype + """ + if stype == 'row_sparse': + raise ValueError("cast_storage from csr to row_sparse is not supported") + return cast_storage(self, stype=stype) + + def copyto(self, other): + """Copies the value of this array to another array. + + If ``other`` is a ``NDArray`` or ``CSRNDArray`` object, then ``other.shape`` and + ``self.shape`` should be the same. This function copies the value from + ``self`` to ``other``. + + If ``other`` is a context, a new ``CSRNDArray`` will be first created on + the target context, and the value of ``self`` is copied. + + Parameters + ---------- + other : NDArray or CSRNDArray or Context + The destination array or context. + + Returns + ------- + NDArray or CSRNDArray + The copied array. If ``other`` is an ``NDArray`` or ``CSRNDArray``, then the return + value and ``other`` will point to the same ``NDArray`` or ``CSRNDArray``. 
+ """ + if isinstance(other, Context): + return super(CSRNDArray, self).copyto(other) + elif isinstance(other, NDArray): + stype = other.stype + if stype == 'default' or stype == 'csr': + return super(CSRNDArray, self).copyto(other) + else: + raise TypeError('copyto does not support destination NDArray stype ' + str(stype)) + else: + raise TypeError('copyto does not support type ' + str(type(other))) + + +# pylint: disable=abstract-method +class RowSparseNDArray(BaseSparseNDArray): + """A sparse representation of a set of NDArray row slices at given indices. + + A RowSparseNDArray represents a multidimensional NDArray using two separate arrays: `data` and + `indices`. + + - data: an NDArray of any dtype with shape [D0, D1, ..., Dn]. + - indices: a 1-D int64 NDArray with shape [D0]. + + The `indices` stores the indices of the row slices with non-zeros, + while the values are stored in `data`. The corresponding NDArray ``dense`` + represented by RowSparseNDArray ``rsp`` has + + ``dense[rsp.indices[i], :, :, :, ...] = rsp.data[i, :, :, :, ...]`` + + >>> dense.asnumpy() + array([[ 1., 2., 3.], + [ 0., 0., 0.], + [ 4., 0., 5.], + [ 0., 0., 0.], + [ 0., 0., 0.]], dtype=float32) + >>> rsp = dense.tostype('row_sparse') + >>> rsp.indices.asnumpy() + array([0, 2], dtype=int64) + >>> rsp.data.asnumpy() + array([[ 1., 2., 3.], + [ 4., 0., 5.]], dtype=float32) + + A RowSparseNDArray is typically used to represent non-zero row-slices of a large NDArray + of shape [LARGE0, D1, .. , Dn] where LARGE0 >> D0 and most row slices are zeros. + + The indices are expected to be sorted in ascending order. + + RowSparseNDArray is used principally in the definition of gradients for operations + that have sparse gradients (e.g. sparse dot and sparse embedding). 
+ """ + def __reduce__(self): + return RowSparseNDArray, (None,), super(RowSparseNDArray, self).__getstate__() + + def __iadd__(self, other): + (self + other).copyto(self) + return self + + def __isub__(self, other): + (self - other).copyto(self) + return self + + def __imul__(self, other): + (self * other).copyto(self) + return self + + def __idiv__(self, other): + (self / other).copyto(self) + return self + + def __itruediv__(self, other): + (self / other).copyto(self) + return self + + def __getitem__(self, key): + """x.__getitem__(i) <=> x[i] + + Returns a sliced view of this array. + + Parameters + ---------- + key : slice + Indexing key. + + Examples + -------- + >>> x = mx.nd.zeros((2, 3), stype='row_sparse') + >>> x[:].asnumpy() + array([[ 0., 0., 0.], + [ 0., 0., 0.]], dtype=float32) + """ + if isinstance(key, int): + raise Exception("__getitem__ with int key is not implemented for RowSparseNDArray yet") + if isinstance(key, py_slice): + if key.step is not None or key.start is not None or key.stop is not None: + raise Exception('RowSparseNDArray only supports [:] for __getitem__') + else: + return self + if isinstance(key, tuple): + raise ValueError('Multi-dimension indexing is not supported') + + def __setitem__(self, key, value): + """x.__setitem__(i, y) <=> x[i]=y + + Set self[key] to value. Only slice key [:] is supported. + + Parameters + ---------- + key : slice + The indexing key. + value : NDArray or numpy.ndarray + The value to set. 
+ + Examples + -------- + >>> src = mx.nd.row_sparse([[1, 0, 2], [4, 5, 6]], [0, 2], (3,3)) + >>> src.asnumpy() + array([[ 1., 0., 2.], + [ 0., 0., 0.], + [ 4., 5., 6.]], dtype=float32) + >>> # assign RowSparseNDArray with same storage type + >>> x = mx.nd.zeros('row_sparse', (3,3)) + >>> x[:] = src + >>> x.asnumpy() + array([[ 1., 0., 2.], + [ 0., 0., 0.], + [ 4., 5., 6.]], dtype=float32) + >>> # assign NDArray to RowSparseNDArray + >>> x[:] = mx.nd.ones((3,3)) + >>> x.asnumpy() + array([[ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.]], dtype=float32) + """ + if not self.writable: + raise ValueError('Failed to assign to a readonly RowSparseNDArray') + if isinstance(key, py_slice): + if key.step is not None or key.start is not None or key.stop is not None: + raise ValueError('Assignment with slice for RowSparseNDArray ' \ + 'is not implmented yet.') + if isinstance(value, NDArray): + # avoid copying to itself + if value.handle is not self.handle: + value.copyto(self) + elif isinstance(value, numeric_types): + raise ValueError("Assigning numeric types to RowSparseNDArray " \ + "is not implemented yet.") + elif isinstance(value, (np.ndarray, np.generic)): + warnings.warn('Assigning non-NDArray object to RowSparseNDArray is not efficient', + RuntimeWarning) + tmp = _array(value) + tmp.copyto(self) + else: + raise TypeError('type %s not supported' % str(type(value))) + else: + assert(isinstance(key, (int, tuple))) + raise TypeError('RowSparseNDArray only supports [:] for assignment') + + @property + def indices(self): + """A deep copy NDArray of the indices array of the RowSparseNDArray. + This generates a deep copy of the row indices of the current `row_sparse` matrix. + + Returns + ------- + NDArray + This RowSparseNDArray's indices array. + """ + return self._aux_data(0) + + @property + def data(self): + """A deep copy NDArray of the data array of the RowSparseNDArray. + This generates a deep copy of the `data` of the current `row_sparse` matrix. 
+ + Returns + ------- + NDArray + This RowSparseNDArray's data array. + """ + return self._data() + + def tostype(self, stype): + """Return a copy of the array with chosen storage type. + + Returns + ------- + NDArray or RowSparseNDArray + A copy of the array with the chosen storage stype + """ + if stype == 'csr': + raise ValueError("cast_storage from row_sparse to csr is not supported") + return cast_storage(self, stype=stype) + + def copyto(self, other): + """Copies the value of this array to another array. + + If ``other`` is a ``NDArray`` or ``RowSparseNDArray`` object, then ``other.shape`` + and ``self.shape`` should be the same. This function copies the value from + ``self`` to ``other``. + + If ``other`` is a context, a new ``RowSparseNDArray`` will be first created on + the target context, and the value of ``self`` is copied. + + Parameters + ---------- + other : NDArray or RowSparseNDArray or Context + The destination array or context. + + Returns + ------- + NDArray or RowSparseNDArray + The copied array. If ``other`` is an ``NDArray`` or ``RowSparseNDArray``, then the + return value and ``other`` will point to the same ``NDArray`` or ``RowSparseNDArray``. + """ + if isinstance(other, Context): + return super(RowSparseNDArray, self).copyto(other) + elif isinstance(other, NDArray): + stype = other.stype + if stype == 'default' or stype == 'row_sparse': + return super(RowSparseNDArray, self).copyto(other) + else: + raise TypeError('copyto does not support destination NDArray stype ' + str(stype)) + else: + raise TypeError('copyto does not support type ' + str(type(other))) + + +def _prepare_src_array(src, dtype, default_dtype): + """Prepare `src` and its dtype so that they can be used to construct NDArray. + `src` is converted to a `np.ndarray` if it's neither an `NDArray` nor an `np.ndarray`. 
+ """ + if isinstance(src, NDArray): + dtype = src.dtype if dtype is None else dtype + else: + dtype = default_dtype if dtype is None else dtype + if not isinstance(src, np.ndarray): + try: + src = np.array(src, dtype=dtype) + except: + raise TypeError('values must be array like object') + return src, dtype + + +def csr_matrix(data, indptr, indices, shape, ctx=None, dtype=None, indptr_type=None, + indices_type=None): + """Creates a 2D array with compressed sparse row(CSR) format. + + Parameters + ---------- + data: array_like + An object exposing the array interface, with shape [nnz], where D0 is the number of + non-zero entries. + indptr: array_like + An object exposing the array interface, with shape [D0 + 1]. The first element in indptr + should always be zero. + indices: array_like + An object exposing the array interface, with shape [nnz]. + ctx: Context, optional + Device context (default is the current default context). + dtype: str or numpy.dtype, optional + The data type of the output array. The default dtype is ``values.dtype`` + if `values` is an `NDArray`, `float32` otherwise. + indptr_type: str or numpy.dtype, optional + The data type of the indices array. The default dtype is ``indptr.dtype`` + if `indptr` is an `NDArray`, `int64` otherwise. + indices_type: str or numpy.dtype, optional + The data type of the indices array. The default dtype is ``indices.dtype`` + if `indicies` is an `NDArray`, `int64` otherwise. + + Returns + ------- + CSRNDArray + A `CSRNDArray` with the `csr` storage representation. 
+ + Example + ------- + >>> import mxnet as mx + >>> a = mx.nd.csr_matrix([1, 2, 3], [0, 1, 2, 2, 3], [1, 0, 2], (4, 3)) + >>> a.asnumpy() + array([[ 0., 1., 0.], + [ 2., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 3.]], dtype=float32) + """ + storage_type = 'csr' + # context + if ctx is None: + ctx = Context.default_ctx + # prepare src array and types + data, dtype = _prepare_src_array(data, dtype, mx_real_t) + indptr, indptr_type = _prepare_src_array(indptr, indptr_type, + _STORAGE_AUX_TYPES[storage_type][0]) + indices, indices_type = _prepare_src_array(indices, indices_type, + _STORAGE_AUX_TYPES[storage_type][1]) + # verify types + assert('int64' in str(indptr_type)), "expected int64 for indptr" + assert('int64' in str(indices_type)), "expected int64 for indices" + # verify shapes + aux_shapes = [indptr.shape, indices.shape] + assert(data.ndim == 1) + assert(indptr.ndim == 1) + assert(indices.ndim == 1) + assert(len(shape) == 2) + result = CSRNDArray(_new_alloc_handle(storage_type, shape, ctx, False, dtype, + [indptr_type, indices_type], aux_shapes)) + # TODO(junwu): Convert data, indptr, and indices to mxnet NDArrays + # if they are not for now. In the future, we should provide a c-api + # to accept np.ndarray types to copy from to result.data and aux_data + if not isinstance(data, NDArray): + data = _array(data, ctx, dtype) + if not isinstance(indptr, NDArray): + indptr = _array(indptr, ctx, indptr_type) + if not isinstance(indices, NDArray): + indices = _array(indices, ctx, indices_type) + check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, data.handle, ctypes.c_int(-1))) + check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, indptr.handle, ctypes.c_int(0))) + check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, indices.handle, ctypes.c_int(1))) + return result + + +def row_sparse_array(data, indices, shape, ctx=None, dtype=None, indices_type=None): + """Creates a multidimensional row sparse array with a set of tensor slices at given indices. 
+ + Parameters + ---------- + data: array_like + An object exposing the array interface, with shape [D0, D1, .. DK], where D0 is + the number of rows with non-zeros entries. + indices: array_like + An object exposing the array interface, with shape [D0]. + ctx : Context, optional + Device context (default is the current default context). + dtype : str or numpy.dtype, optional + The data type of the output array. The default dtype is ``data.dtype`` + if `data` is an `NDArray`, `float32` otherwise. + indices_type: str or numpy.dtype, optional + The data type of the indices array. The default dtype is ``indices.dtype`` + if `indicies` is an `NDArray`, `int64` otherwise. + + Returns + ------- + RowSparseNDArray + An `RowSparseNDArray` with the `row_sparse` storage representation. + + Example + ------- + >>> a = mx.nd.row_sparse_array([[1, 2], [3, 4]], [1, 4], (6, 2)) + >>> a.asnumpy() + array([[ 0., 0.], + [ 1., 2.], + [ 0., 0.], + [ 0., 0.], + [ 3., 4.], + [ 0., 0.]], dtype=float32) + """ + storage_type = 'row_sparse' + # context + if ctx is None: + ctx = Context.default_ctx + # prepare src array and types + data, dtype = _prepare_src_array(data, dtype, mx_real_t) + indices, indices_type = _prepare_src_array(indices, indices_type, + _STORAGE_AUX_TYPES[storage_type][0]) + # verify types + assert('int64' in str(indices_type)), "expected int64 for indices" + # verify shapes + assert(data.ndim == len(shape)) + assert(indices.ndim == 1) + result = RowSparseNDArray(_new_alloc_handle(storage_type, shape, ctx, False, dtype, + [indices_type], [indices.shape])) + + # TODO(junwu): Convert data, indptr, and indices to mxnet NDArrays + # if they are not for now. 
In the future, we should provide a c-api + # to accept np.ndarray types to copy from to result.data and aux_data + if not isinstance(data, NDArray): + data = _array(data, ctx, dtype) + if not isinstance(indices, NDArray): + indices = _array(indices, ctx, indices_type) + check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, data.handle, ctypes.c_int(-1))) + check_call(_LIB.MXNDArraySyncCopyFromNDArray(result.handle, indices.handle, ctypes.c_int(0))) + return result + + +def _ndarray_cls(handle, writable=True, stype=None): + if stype is None: + stype = _storage_type(handle) + if stype == 'default': + return NDArray(handle, writable=writable) + elif stype == 'csr': + return CSRNDArray(handle, writable=writable) + elif stype == 'row_sparse': + return RowSparseNDArray(handle, writable=writable) + else: + raise Exception("unknown storage type") + + +_set_ndarray_class(_ndarray_cls) + + +def zeros(stype, shape, ctx=None, dtype=None, aux_types=None, **kwargs): + """Return a new array of given shape and type, filled with zeros. 
+ + Parameters + ---------- + stype: string + The storage type of the empty array, such as 'row_sparse', 'csr', etc + shape : int or tuple of int + The shape of the empty array + ctx : Context, optional + An optional device context (default is the current default context) + dtype : str or numpy.dtype, optional + An optional value type (default is `float32`) + aux_types: list of numpy.dtype, optional + An optional list of types of the aux data for RowSparseNDArray or CSRNDArray + (default values depends on the storage type) + + Returns + ------- + RowSparseNDArray or CSRNDArray + A created array + Examples + -------- + >>> mx.nd.zeros((1,2), mx.cpu(), stype='csr') + + >>> mx.nd.zeros((1,2), mx.cpu(), 'float16', stype='row_sparse').asnumpy() + array([[ 0., 0.]], dtype=float16) + """ + if stype == 'default': + return _zeros_ndarray(shape, ctx=ctx, dtype=dtype, **kwargs) + if ctx is None: + ctx = Context.default_ctx + dtype = mx_real_t if dtype is None else dtype + if aux_types is None: + if stype == 'row_sparse' or stype == 'csr': + aux_types = _STORAGE_AUX_TYPES[stype] + else: + raise Exception("unknown storage type") + assert(len(aux_types) == len(_STORAGE_AUX_TYPES[stype])) + out = _ndarray_cls(_new_alloc_handle(stype, shape, ctx, True, dtype, aux_types)) + return _internal._zeros(shape=shape, ctx=ctx, dtype=dtype, out=out, **kwargs) + + +def empty(stype, shape, ctx=None, dtype=None, aux_types=None): + """Returns a new array of given shape and type, without initializing entries. + """ + if isinstance(shape, int): + shape = (shape, ) + if ctx is None: + ctx = Context.default_ctx + if dtype is None: + dtype = mx_real_t + assert(stype is not None) + if stype == 'csr' or stype == 'row_sparse': + return zeros(stype, shape, ctx=ctx, dtype=dtype, aux_types=aux_types) + else: + raise Exception("unknown stype : " + str(stype)) + + +def array(source_array, ctx=None, dtype=None, aux_types=None): + """Creates a sparse array from any object exposing the array interface. 
+ """ + if isinstance(source_array, NDArray): + assert(source_array.stype != 'default'), \ + "Please use `cast_storage` to create BaseSparseNDArray from an NDArray" + dtype = source_array.dtype if dtype is None else dtype + aux_types = source_array._aux_types if aux_types is None else aux_types + else: + # TODO(haibin/anisub) support creation from scipy object when `_sync_copy_from` is ready + raise NotImplementedError('creating BaseSparseNDArray from ' \ + ' a non-NDArray object is not implemented.') + arr = empty(source_array.stype, source_array.shape, ctx, dtype, aux_types) + arr[:] = source_array + return arr diff --git a/python/mxnet/ndarray/utils.py b/python/mxnet/ndarray/utils.py new file mode 100644 index 000000000000..a0dd83692b87 --- /dev/null +++ b/python/mxnet/ndarray/utils.py @@ -0,0 +1,240 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# coding: utf-8 +"""Utility functions for NDArray and BaseSparseNDArray.""" +import ctypes + +from ..base import _LIB, check_call, py_str, c_str, string_types, mx_uint, NDArrayHandle, c_array +from .ndarray import NDArray +from .ndarray import array as _array +from .ndarray import empty as _empty_ndarray +from .ndarray import zeros as _zeros_ndarray +from .sparse import zeros as _zeros_sparse_ndarray +from .sparse import empty as _empty_sparse_ndarray +from .sparse import array as _sparse_array +from .sparse import _ndarray_cls + + +def zeros(shape, ctx=None, dtype=None, stype=None, aux_types=None, **kwargs): + """Return a new array of given shape and type, filled with zeros. + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array + ctx : Context, optional + An optional device context (default is the current default context) + dtype : str or numpy.dtype, optional + An optional value type (default is `float32`) + stype: string, optional + The storage type of the empty array, such as 'row_sparse', 'csr', etc. + aux_types: list of numpy.dtype, optional + An optional list of types of the aux data for RowSparseNDArray or CSRNDArray + (default values depend on the storage type) + + Returns + ------- + NDArray, CSRNDArray or RowSparseNDArray + A created array + Examples + -------- + >>> mx.nd.zeros((1,2), mx.cpu(), stype='csr') + + >>> mx.nd.zeros((1,2), mx.cpu(), 'float16', stype='row_sparse').asnumpy() + array([[ 0., 0.]], dtype=float16) + """ + + if stype is None or stype == 'default': + return _zeros_ndarray(shape, ctx, dtype, **kwargs) + else: + return _zeros_sparse_ndarray(stype, shape, ctx, dtype, aux_types, **kwargs) + + +def empty(shape, ctx=None, dtype=None, stype=None, aux_types=None): + """Returns a new array of given shape and type, without initializing entries. + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array. 
+ ctx : Context, optional + An optional device context (default is the current default context). + dtype : str or numpy.dtype, optional + An optional value type (default is `float32`). + stype : str, optional + An optional storage type (default is `default`). + aux_types: list of numpy.dtype, optional + An optional list of types of the aux data for RowSparseNDArray or CSRNDArray + (default values depend on the storage type) + + Returns + ------- + NDArray, CSRNDArray or RowSparseNDArray + A created array. + + Examples + -------- + >>> mx.nd.empty(1) + + >>> mx.nd.empty((1,2), mx.gpu(0)) + + >>> mx.nd.empty((1,2), mx.gpu(0), 'float16') + + >>> mx.nd.empty((1,2), stype='csr') + + """ + if stype is None or stype == 'default': + return _empty_ndarray(shape, ctx, dtype) + else: + return _empty_sparse_ndarray(stype, shape, ctx, dtype, aux_types) + + +def array(source_array, ctx=None, dtype=None, aux_types=None): + """Creates an array from any object exposing the array interface. + + Parameters + ---------- + source_array : array_like + An object exposing the array interface, an object whose `__array__` + method returns an array, or any (nested) sequence. + ctx : Context, optional + Device context (default is the current default context). + dtype : str or numpy.dtype, optional + The data type of the output array. The default dtype is ``source_array.dtype`` + if `source_array` is an `NDArray`, `float32` otherwise. + aux_types: list of numpy.dtype, optional + An optional list of types of the aux data for RowSparseNDArray or CSRNDArray + (default values depend on the storage type) + + Returns + ------- + NDArray, RowSparseNDArray or CSRNDArray + An array with the same contents as the `source_array`. 
+ + Examples + -------- + >>> import numpy as np + >>> mx.nd.array([1, 2, 3]) + + >>> mx.nd.array([[1, 2], [3, 4]]) + + >>> mx.nd.array(np.zeros((3, 2))) + + >>> mx.nd.array(np.zeros((3, 2)), mx.gpu(0)) + + >>> mx.nd.array(mx.nd.zeros((3, 2), stype='row_sparse')) + + """ + # TODO(haibin/anisub) Check if input is scipy.sparse object with `scipy.sparse.issparse` + if isinstance(source_array, NDArray) and source_array.stype != 'default': + return _sparse_array(source_array, ctx=ctx, dtype=dtype, aux_types=aux_types) + else: + return _array(source_array, ctx=ctx, dtype=dtype) + + +def load(fname): + """Loads an array from file. + + See more details in ``save``. + + Parameters + ---------- + fname : str + The filename. + + Returns + ------- + list of NDArray, RowSparseNDArray or CSRNDArray, or \ + dict of str to NDArray, RowSparseNDArray or CSRNDArray + Loaded data. + """ + if not isinstance(fname, string_types): + raise TypeError('fname required to be a string') + out_size = mx_uint() + out_name_size = mx_uint() + handles = ctypes.POINTER(NDArrayHandle)() + names = ctypes.POINTER(ctypes.c_char_p)() + check_call(_LIB.MXNDArrayLoad(c_str(fname), + ctypes.byref(out_size), + ctypes.byref(handles), + ctypes.byref(out_name_size), + ctypes.byref(names))) + if out_name_size.value == 0: + return [_ndarray_cls(NDArrayHandle(handles[i])) for i in range(out_size.value)] + else: + assert out_name_size.value == out_size.value + return dict( + (py_str(names[i]), _ndarray_cls(NDArrayHandle(handles[i]))) + for i in range(out_size.value)) + + +def save(fname, data): + """Saves a list of arrays or a dict of str->array to file. + + Examples of filenames: + + - ``/path/to/file`` + - ``s3://my-bucket/path/to/file`` (if compiled with AWS S3 supports) + - ``hdfs://path/to/file`` (if compiled with HDFS supports) + + Parameters + ---------- + fname : str + The filename. 
+ data : NDArray, RowSparseNDArray or CSRNDArray, \ + or list of NDArray, RowSparseNDArray or CSRNDArray, \ + or dict of str to NDArray, RowSparseNDArray or CSRNDArray + The data to save. + + Examples + -------- + >>> x = mx.nd.zeros((2,3)) + >>> y = mx.nd.ones((1,4)) + >>> mx.nd.save('my_list', [x,y]) + >>> mx.nd.save('my_dict', {'x':x, 'y':y}) + >>> mx.nd.load('my_list') + [, ] + >>> mx.nd.load('my_dict') + {'y': , 'x': } + """ + if isinstance(data, NDArray): + data = [data] + handles = [] + if isinstance(data, dict): + keys = [] + for key, val in data.items(): + if not isinstance(key, string_types): + raise TypeError('save only accept dict str->NDArray or list of NDArray') + if not isinstance(val, NDArray): + raise TypeError('save only accept dict str->NDArray or list of NDArray') + keys.append(c_str(key)) + handles.append(val.handle) + keys = c_array(ctypes.c_char_p, keys) + elif isinstance(data, list): + for val in data: + if not isinstance(val, NDArray): + raise TypeError('save only accept dict str->NDArray or list of NDArray') + handles.append(val.handle) + keys = None + else: + raise ValueError("data needs to either be a NDArray, dict of str, NDArray pairs " + "or a list of NDarrays.") + check_call(_LIB.MXNDArraySave(c_str(fname), + mx_uint(len(handles)), + c_array(NDArrayHandle, handles), + keys)) diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 1ef9cc845036..e7e283f88e43 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -339,8 +339,8 @@ class SGD(Optimizer): state = momentum * state + lr * rescale_grad * clip(grad, clip_gradient) + wd * weight weight = weight - state - For details of the update algorithm see :class:`~mxnet.ndarray.sgd_update` and - :class:`~mxnet.ndarray.sgd_mom_update`. + Sparse updating is supported. For details of the update algorithm see + :class:`~mxnet.ndarray.sgd_update` and :class:`~mxnet.ndarray.sgd_mom_update`. 
This optimizer accepts the following parameters in addition to those accepted by :class:`.Optimizer`. @@ -367,7 +367,8 @@ def create_state(self, index, weight): if self.multi_precision and weight.dtype == numpy.float16: weight_master_copy = array(weight, ctx=weight.context, dtype=numpy.float32) if self.momentum != 0.0: - momentum = zeros(weight.shape, weight.context, dtype=numpy.float32) + momentum = zeros(weight.shape, weight.context, dtype=numpy.float32, + stype=weight.stype) return (momentum, weight_master_copy) if weight.dtype == numpy.float16 and not self.multi_precision: warnings.warn("Accumulating with float16 in optimizer can lead to " @@ -375,7 +376,7 @@ def create_state(self, index, weight): "Consider using multi_precision=True option of the " "SGD optimizer") if self.momentum != 0.0: - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype) + momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=weight.stype) return momentum def update(self, index, weight, grad, state): @@ -563,8 +564,10 @@ def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, self.epsilon = epsilon def create_state(self, index, weight): - return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean - zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance + return (zeros(weight.shape, weight.context, dtype=weight.dtype, + stype=weight.stype), # mean + zeros(weight.shape, weight.context, dtype=weight.dtype, + stype=weight.stype)) # variance def update(self, index, weight, grad, state): assert(isinstance(weight, NDArray)) @@ -669,11 +672,11 @@ def __init__(self, learning_rate=0.001, gamma1=0.9, gamma2=0.9, def create_state(self, index, weight): if self.centered: return ( - zeros(weight.shape, weight.context), # n - zeros(weight.shape, weight.context), # g - zeros(weight.shape, weight.context)) # delta + zeros(weight.shape, weight.context, stype=weight.stype), # n + zeros(weight.shape, weight.context, 
stype=weight.stype), # g + zeros(weight.shape, weight.context, stype=weight.stype)) # delta else: - return (zeros(weight.shape, weight.context), ) # n + return (zeros(weight.shape, weight.context, stype=weight.stype),) # n def update(self, index, weight, grad, state): assert(isinstance(weight, NDArray)) diff --git a/python/mxnet/random.py b/python/mxnet/random.py index 29b250d980ce..14bfc2731bd6 100644 --- a/python/mxnet/random.py +++ b/python/mxnet/random.py @@ -22,13 +22,13 @@ import ctypes from .base import _LIB, check_call -from ._ndarray_internal import _sample_uniform as uniform -from ._ndarray_internal import _sample_normal as normal -from ._ndarray_internal import _sample_gamma as gamma -from ._ndarray_internal import _sample_exponential as exponential -from ._ndarray_internal import _sample_poisson as poisson -from ._ndarray_internal import _sample_negbinomial as negative_binomial -from ._ndarray_internal import _sample_gennegbinomial as generalized_negative_binomial +from .ndarray._internal import _sample_uniform as uniform +from .ndarray._internal import _sample_normal as normal +from .ndarray._internal import _sample_gamma as gamma +from .ndarray._internal import _sample_exponential as exponential +from .ndarray._internal import _sample_poisson as poisson +from .ndarray._internal import _sample_negbinomial as negative_binomial +from .ndarray._internal import _sample_gennegbinomial as generalized_negative_binomial def seed(seed_state): """Seeds the random number generators in MXNet. diff --git a/python/mxnet/symbol/__init__.py b/python/mxnet/symbol/__init__.py new file mode 100644 index 000000000000..d93a230f490d --- /dev/null +++ b/python/mxnet/symbol/__init__.py @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Symbol API of MXNet.""" + +from . import _internal, sparse, op +# pylint: disable=wildcard-import, redefined-builtin +from .symbol import * +from ..ndarray import _GRAD_REQ_MAP diff --git a/python/mxnet/_symbol_internal.py b/python/mxnet/symbol/_internal.py similarity index 100% rename from python/mxnet/_symbol_internal.py rename to python/mxnet/symbol/_internal.py diff --git a/python/mxnet/symbol/op.py b/python/mxnet/symbol/op.py new file mode 100644 index 000000000000..82884a5cc6a2 --- /dev/null +++ b/python/mxnet/symbol/op.py @@ -0,0 +1,242 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""Register backend ops in mxnet.symbol namespace.""" + +import sys as _sys +import os as _os +import ctypes +import numpy as _numpy # pylint: disable=unused-import + +from mxnet.base import mx_uint, check_call, _LIB, py_str, OpHandle, c_str +from mxnet.symbol_doc import _build_doc + +# Use different version of SymbolBase +# When possible, use cython to speedup part of computation. +# pylint: disable=unused-import +try: + if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0: + from .._ctypes.symbol import SymbolBase, _set_symbol_class + from .._ctypes.symbol import _symbol_creator + elif _sys.version_info >= (3, 0): + from .._cy3.symbol import SymbolBase, _set_symbol_class + from .._cy3.symbol import _symbol_creator + else: + from .._cy2.symbol import SymbolBase, _set_symbol_class + from .._cy2.symbol import _symbol_creator +except ImportError: + if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0: + raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1") + from .._ctypes.symbol import SymbolBase, _set_symbol_class + from .._ctypes.symbol import _symbol_creator + +from ..base import _Null +from ..name import NameManager +from ..attribute import AttrScope +# pylint: enable=unused-import + + +def _make_atomic_symbol_function(handle, name): + """Create an atomic symbol function by handle and function name.""" + real_name = ctypes.c_char_p() + desc = ctypes.c_char_p() + num_args = mx_uint() + arg_names = ctypes.POINTER(ctypes.c_char_p)() + arg_types = ctypes.POINTER(ctypes.c_char_p)() + arg_descs = ctypes.POINTER(ctypes.c_char_p)() + key_var_num_args = ctypes.c_char_p() + ret_type = ctypes.c_char_p() + + check_call(_LIB.MXSymbolGetAtomicSymbolInfo( + handle, ctypes.byref(real_name), ctypes.byref(desc), + ctypes.byref(num_args), + ctypes.byref(arg_names), + ctypes.byref(arg_types), + ctypes.byref(arg_descs), + ctypes.byref(key_var_num_args), + ctypes.byref(ret_type))) + narg = int(num_args.value) + arg_names = [py_str(arg_names[i]) 
for i in range(narg)] + arg_types = [py_str(arg_types[i]) for i in range(narg)] + func_name = name + key_var_num_args = py_str(key_var_num_args.value) + ret_type = py_str(ret_type.value) if ret_type.value is not None else '' + doc_str = _build_doc(func_name, + py_str(desc.value), + arg_names, + arg_types, + [py_str(arg_descs[i]) for i in range(narg)], + key_var_num_args, + ret_type) + + dtype_name = None + arr_name = None + ndsignature = [] + signature = [] + ndarg_names = [] + kwarg_names = [] + for i in range(narg): + name, atype = arg_names[i], arg_types[i] + if name == 'dtype': + dtype_name = name + signature.append('%s=_Null'%name) + elif atype.startswith('NDArray') or atype.startswith('Symbol'): + assert not arr_name, \ + "Op can only have one argument with variable " \ + "size and it must be the last argument." + if atype.endswith('[]'): + ndsignature.append('*%s'%name) + arr_name = name + else: + ndsignature.append('%s=None'%name) + ndarg_names.append(name) + else: + signature.append('%s=_Null'%name) + kwarg_names.append(name) + #signature.append('is_train=False') + signature.append('name=None') + signature.append('attr=None') + signature.append('out=None') + signature.append('**kwargs') + signature = ndsignature + signature + + code = [] + if arr_name: + code.append(""" +def %s(*%s, **kwargs):"""%(func_name, arr_name)) + code.append(""" + sym_args = [] + for i in {}: + assert isinstance(i, SymbolBase), \\ + "Positional arguments must be Symbol instances, " \\ + "but got %s"%str(i) + sym_args.append(i)""".format(arr_name)) + if dtype_name is not None: + code.append(""" + if '%s' in kwargs: + kwargs['%s'] = _numpy.dtype(kwargs['%s']).name"""%( + dtype_name, dtype_name, dtype_name)) + code.append(""" + attr = kwargs.pop('attr', None) + kwargs.update(AttrScope.current.get(attr)) + name = kwargs.pop('name', None) + name = NameManager.current.get(name, '%s') + _ = kwargs.pop('out', None) + keys = [] + vals = [] + sym_kwargs = dict() + for k, v in kwargs.items(): 
+ if isinstance(v, SymbolBase): + sym_kwargs[k] = v + else: + keys.append(k) + vals.append(v)"""%(func_name.lower())) + if key_var_num_args: + code.append(""" + if '%s' not in kwargs: + keys.append('%s') + vals.append(len(sym_args) + len(sym_kwargs))"""%( + key_var_num_args, key_var_num_args)) + + code.append(""" + return _symbol_creator(%d, sym_args, sym_kwargs, keys, vals, name)"""%( + handle.value)) + else: + code.append(""" +def %s(%s): + kwargs.update(AttrScope.current.get(attr)) + sym_kwargs = dict() + keys = [] + vals = []"""%(func_name, ', '.join(signature))) + code.append(""" + for k, v in kwargs.items(): + if isinstance(v, SymbolBase): + sym_kwargs[k] = v + else: + keys.append(k) + vals.append(v)""") + # NDArray args + for name in ndarg_names: # pylint: disable=redefined-argument-from-local + code.append(""" + if {name} is not None: + assert isinstance({name}, SymbolBase), \\ + "Argument {name} must be Symbol instances, but got %s"%str({name}) + sym_kwargs['{name}'] = {name}""".format(name=name)) + # kwargs + for name in kwarg_names: # pylint: disable=redefined-argument-from-local + code.append(""" + if %s is not _Null: + keys.append('%s') + vals.append(%s)"""%(name, name, name)) + # dtype + if dtype_name is not None: + code.append(""" + if %s is not _Null: + keys.append('%s') + vals.append(_numpy.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) + + code.append(""" + name = NameManager.current.get(name, '%s') + return _symbol_creator(%d, None, sym_kwargs, keys, vals, name)"""%( + func_name.lower(), handle.value)) + + local = {} + exec(''.join(code), None, local) # pylint: disable=exec-used + symbol_function = local[func_name] + symbol_function.__name__ = func_name + symbol_function.__doc__ = doc_str + symbol_function.__module__ = 'mxnet.symbol' + return symbol_function + + +def _init_symbol_module(root_namespace): + """List and add all the atomic symbol functions to current module.""" + plist = ctypes.POINTER(ctypes.c_char_p)() + size = 
ctypes.c_uint() + + check_call(_LIB.MXListAllOpNames(ctypes.byref(size), + ctypes.byref(plist))) + op_names = [] + for i in range(size.value): + op_names.append(py_str(plist[i])) + + module_obj = _sys.modules["%s.symbol" % root_namespace] + module_sparse = _sys.modules["%s.symbol.sparse" % root_namespace] + module_internal = _sys.modules["%s.symbol._internal" % root_namespace] + module_contrib = _sys.modules["%s.contrib.symbol" % root_namespace] + for name in op_names: + hdl = OpHandle() + check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl))) + function = _make_atomic_symbol_function(hdl, name) + if function.__name__.startswith('_contrib_'): + function.__name__ = function.__name__[9:] + function.__module__ = 'mxnet.contrib.symbol' + setattr(module_contrib, function.__name__, function) + elif function.__name__.startswith('_'): + setattr(module_internal, function.__name__, function) + else: + setattr(module_obj, function.__name__, function) + + # register sparse ops under mxnet.symbol.sparse + if function.__name__.startswith('_sparse_'): + function.__name__ = function.__name__[8:] + function.__module__ = 'mxnet.symbol.sparse' + setattr(module_sparse, function.__name__, function) + + +# Initialize the atomic symbol in startups +_init_symbol_module("mxnet") diff --git a/python/mxnet/symbol/sparse.py b/python/mxnet/symbol/sparse.py new file mode 100644 index 000000000000..1d94f2b85bc7 --- /dev/null +++ b/python/mxnet/symbol/sparse.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Sparse Symbol API of MXNet.""" diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol/symbol.py similarity index 90% rename from python/mxnet/symbol.py rename to python/mxnet/symbol/symbol.py index 14cb3811deeb..aa8ca0b8dd53 100644 --- a/python/mxnet/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -29,39 +29,19 @@ import warnings from numbers import Number -import os as _os -import sys as _sys import numpy as _numpy -from .base import _LIB, numeric_types -from .base import c_array, c_str, mx_uint, py_str, string_types -from .base import NDArrayHandle, ExecutorHandle, SymbolHandle, OpHandle -from .base import check_call, MXNetError, NotImplementedForSymbol, _Null # pylint: disable=unused-import -from .context import Context -from .ndarray import NDArray, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP, _GRAD_REQ_MAP -from .name import NameManager # pylint: disable=unused-import -from .executor import Executor -from . import _symbol_internal as _internal -from .attribute import AttrScope -from .symbol_doc import _build_doc - -# Use different version of SymbolBase -# When possible, use cython to speedup part of computation. 
-try: - if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0: - from ._ctypes.symbol import SymbolBase, _set_symbol_class - from ._ctypes.symbol import _symbol_creator # pylint: disable=unused-import - elif _sys.version_info >= (3, 0): - from ._cy3.symbol import SymbolBase, _set_symbol_class - from ._cy3.symbol import _symbol_creator # pylint: disable=unused-import - else: - from ._cy2.symbol import SymbolBase, _set_symbol_class - from ._cy2.symbol import _symbol_creator # pylint: disable=unused-import -except ImportError: - if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0: - raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1") - from ._ctypes.symbol import SymbolBase, _set_symbol_class - from ._ctypes.symbol import _symbol_creator # pylint: disable=unused-import +from ..base import _LIB, numeric_types +from ..base import c_array, c_str, mx_uint, py_str, string_types +from ..base import NDArrayHandle, ExecutorHandle, SymbolHandle +from ..base import check_call, MXNetError, NotImplementedForSymbol +from ..context import Context +from ..ndarray import NDArray, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP, _GRAD_REQ_MAP +from ..ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID +from ..ndarray import _ndarray_cls +from ..executor import Executor +from . 
import _internal, reshape +from .op import SymbolBase, _set_symbol_class, AttrScope, _Null # pylint: disable=unused-import class Symbol(SymbolBase): @@ -1263,8 +1243,9 @@ def _get_ndarray_inputs(arg_key, args, arg_names, allow_missing): raise TypeError('Only accept list of NDArrays or dict of str to NDArray') return c_array(NDArrayHandle, arg_handles), arg_arrays - def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None, - shared_arg_names=None, shared_exec=None, shared_buffer=None, **kwargs): + def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None, + group2ctx=None, shared_arg_names=None, shared_exec=None, + shared_buffer=None, **kwargs): """Bind current symbol to get an executor, allocate all the arguments needed. Allows specifying data types. @@ -1306,6 +1287,9 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None, type_dict : Dict of str->numpy.dtype Input type dictionary, name->dtype + stype_dict : Dict of str->str + Input storage type dictionary, name->storage_type + group2ctx : Dict of string to mx.Context The dict mapping the `ctx_group` attribute to the context assignment. @@ -1320,7 +1304,8 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None, shared_buffer : Dict of string to `NDArray` The dict mapping argument names to the `NDArray` that can be reused for initializing the current executor. This buffer will be checked for reuse if one argument name - of the current executor is not found in `shared_arg_names`. + of the current executor is not found in `shared_arg_names`. The `NDArray`s are + expected have default storage type. 
kwargs : Dict of str->shape Input shape dictionary, name->shape @@ -1330,6 +1315,7 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None, executor : mxnet.Executor The generated executor """ + # data types num_provided_arg_types = 0 provided_arg_type_names = ctypes.POINTER(ctypes.c_char_p)() # provided type argument names provided_arg_type_data = ctypes.POINTER(mx_uint)() # provided types @@ -1345,6 +1331,22 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None, provided_arg_type_names = c_array(ctypes.c_char_p, provided_arg_type_names) provided_arg_type_data = c_array(ctypes.c_int, provided_arg_type_data) + # storage types + num_provided_arg_stypes = 0 + # provided storage type argument names + provided_arg_stype_names = ctypes.POINTER(ctypes.c_char_p)() + provided_arg_stype_data = ctypes.POINTER(mx_uint)() # provided storage types + if stype_dict is not None: + provided_arg_stype_names = [] + provided_arg_stype_data = [] + for k, v in stype_dict.items(): + if v in _STORAGE_TYPE_STR_TO_ID: + provided_arg_stype_names.append(c_str(k)) + provided_arg_stype_data.append(ctypes.c_int(_STORAGE_TYPE_STR_TO_ID[v])) + num_provided_arg_stypes = mx_uint(len(provided_arg_stype_names)) + provided_arg_stype_names = c_array(ctypes.c_char_p, provided_arg_stype_names) + provided_arg_stype_data = c_array(ctypes.c_int, provided_arg_stype_data) + provided_arg_shape_data = [] # shape data # argument shape index in sdata, # e.g. 
[sdata[indptr[0]], sdata[indptr[1]]) is the shape of the first arg @@ -1418,6 +1420,8 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None, shared_buffer_names = [] shared_buffer_handles = [] for k, v in shared_buffer.items(): + assert(v.stype == 'default'), \ + "shared_buffer is expected to only contain NDArrays with default storage" shared_buffer_names.append(c_str(k)) shared_buffer_handles.append(v.handle) shared_buffer_names = c_array(ctypes.c_char_p, shared_buffer_names) @@ -1457,6 +1461,9 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None, num_provided_arg_types, provided_arg_type_names, provided_arg_type_data, + num_provided_arg_stypes, + provided_arg_stype_names, + provided_arg_stype_data, mx_uint(len(shared_arg_name_list)), c_array(ctypes.c_char_p, shared_arg_name_list), ctypes.byref(shared_buffer_len), @@ -1486,11 +1493,12 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, group2ctx=None, shared_buffer[k] = v # create in_args, arg_grads, and aux_states for the current executor - arg_arrays = [NDArray(NDArrayHandle(in_arg_handles[i])) for i in range(num_in_args.value)] - grad_arrays = [NDArray(NDArrayHandle(arg_grad_handles[i])) + arg_arrays = [_ndarray_cls(NDArrayHandle(in_arg_handles[i])) \ + for i in range(num_in_args.value)] + grad_arrays = [_ndarray_cls(NDArrayHandle(arg_grad_handles[i])) if arg_grad_handles[i] is not None else None for i in range(num_in_args.value)] - aux_arrays = [NDArray(NDArrayHandle(aux_state_handles[i])) + aux_arrays = [_ndarray_cls(NDArrayHandle(aux_state_handles[i])) for i in range(num_aux_states.value)] executor = Executor(exe_handle, self, ctx, grad_req, group2ctx) @@ -1767,7 +1775,8 @@ def detach(self): def backward(self): raise NotImplementedForSymbol(self.backward, None) -def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None, init=None, **kwargs): +def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None, + 
init=None, stype=None, **kwargs): """Creates a symbolic variable with specified name. Example usage: @@ -1794,6 +1803,8 @@ def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None, ini The dtype for input variable. If not specified, this value will be inferred. init : initializer (mxnet.init.*) Initializer for this variable to (optionally) override the default initializer. + stype : str + The storage type of the variable. kwargs : Additional attribute variables Additional attributes must start and end with double underscores. @@ -1821,6 +1832,8 @@ def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None, ini if not isinstance(init, string_types): init = init.dumps() attr['__init__'] = init + if stype is not None: + attr['__storage_type__'] = str(_STORAGE_TYPE_STR_TO_ID[stype]) for k, v in kwargs.items(): if k.startswith('__') and k.endswith('__'): attr[k] = str(v) @@ -2195,188 +2208,4 @@ def arange(start, stop=None, step=1.0, repeat=1, name=None, dtype=None): return _internal._arange(start=start, stop=stop, step=step, repeat=repeat, name=name, dtype=dtype) - -def _make_atomic_symbol_function(handle, name): - """Create an atomic symbol function by handle and function name.""" - real_name = ctypes.c_char_p() - desc = ctypes.c_char_p() - num_args = mx_uint() - arg_names = ctypes.POINTER(ctypes.c_char_p)() - arg_types = ctypes.POINTER(ctypes.c_char_p)() - arg_descs = ctypes.POINTER(ctypes.c_char_p)() - key_var_num_args = ctypes.c_char_p() - ret_type = ctypes.c_char_p() - - check_call(_LIB.MXSymbolGetAtomicSymbolInfo( - handle, ctypes.byref(real_name), ctypes.byref(desc), - ctypes.byref(num_args), - ctypes.byref(arg_names), - ctypes.byref(arg_types), - ctypes.byref(arg_descs), - ctypes.byref(key_var_num_args), - ctypes.byref(ret_type))) - narg = int(num_args.value) - arg_names = [py_str(arg_names[i]) for i in range(narg)] - arg_types = [py_str(arg_types[i]) for i in range(narg)] - func_name = name - key_var_num_args = 
py_str(key_var_num_args.value) - ret_type = py_str(ret_type.value) if ret_type.value is not None else '' - doc_str = _build_doc(func_name, - py_str(desc.value), - arg_names, - arg_types, - [py_str(arg_descs[i]) for i in range(narg)], - key_var_num_args, - ret_type) - - dtype_name = None - arr_name = None - ndsignature = [] - signature = [] - ndarg_names = [] - kwarg_names = [] - for i in range(narg): - name, atype = arg_names[i], arg_types[i] - if name == 'dtype': - dtype_name = name - signature.append('%s=_Null'%name) - elif atype.startswith('NDArray') or atype.startswith('Symbol'): - assert not arr_name, \ - "Op can only have one argument with variable " \ - "size and it must be the last argument." - if atype.endswith('[]'): - ndsignature.append('*%s'%name) - arr_name = name - else: - ndsignature.append('%s=None'%name) - ndarg_names.append(name) - else: - signature.append('%s=_Null'%name) - kwarg_names.append(name) - #signature.append('is_train=False') - signature.append('name=None') - signature.append('attr=None') - signature.append('out=None') - signature.append('**kwargs') - signature = ndsignature + signature - - code = [] - if arr_name: - code.append(""" -def %s(*%s, **kwargs):"""%(func_name, arr_name)) - code.append(""" - sym_args = [] - for i in {}: - assert isinstance(i, SymbolBase), \\ - "Positional arguments must be Symbol instances, " \\ - "but got %s"%str(i) - sym_args.append(i)""".format(arr_name)) - if dtype_name is not None: - code.append(""" - if '%s' in kwargs: - kwargs['%s'] = _numpy.dtype(kwargs['%s']).name"""%( - dtype_name, dtype_name, dtype_name)) - code.append(""" - attr = kwargs.pop('attr', None) - kwargs.update(AttrScope.current.get(attr)) - name = kwargs.pop('name', None) - name = NameManager.current.get(name, '%s') - _ = kwargs.pop('out', None) - keys = [] - vals = [] - sym_kwargs = dict() - for k, v in kwargs.items(): - if isinstance(v, SymbolBase): - sym_kwargs[k] = v - else: - keys.append(k) - vals.append(v)"""%(func_name.lower())) - 
if key_var_num_args: - code.append(""" - if '%s' not in kwargs: - keys.append('%s') - vals.append(len(sym_args) + len(sym_kwargs))"""%( - key_var_num_args, key_var_num_args)) - - code.append(""" - return _symbol_creator(%d, sym_args, sym_kwargs, keys, vals, name)"""%( - handle.value)) - else: - code.append(""" -def %s(%s): - kwargs.update(AttrScope.current.get(attr)) - sym_kwargs = dict() - keys = [] - vals = []"""%(func_name, ', '.join(signature))) - code.append(""" - for k, v in kwargs.items(): - if isinstance(v, SymbolBase): - sym_kwargs[k] = v - else: - keys.append(k) - vals.append(v)""") - # NDArray args - for name in ndarg_names: # pylint: disable=redefined-argument-from-local - code.append(""" - if {name} is not None: - assert isinstance({name}, SymbolBase), \\ - "Argument {name} must be Symbol instances, but got %s"%str({name}) - sym_kwargs['{name}'] = {name}""".format(name=name)) - # kwargs - for name in kwarg_names: # pylint: disable=redefined-argument-from-local - code.append(""" - if %s is not _Null: - keys.append('%s') - vals.append(%s)"""%(name, name, name)) - # dtype - if dtype_name is not None: - code.append(""" - if %s is not _Null: - keys.append('%s') - vals.append(_numpy.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) - - code.append(""" - name = NameManager.current.get(name, '%s') - return _symbol_creator(%d, None, sym_kwargs, keys, vals, name)"""%( - func_name.lower(), handle.value)) - - local = {} - exec(''.join(code), None, local) # pylint: disable=exec-used - symbol_function = local[func_name] - symbol_function.__name__ = func_name - symbol_function.__doc__ = doc_str - symbol_function.__module__ = 'mxnet.symbol' - return symbol_function - - -def _init_symbol_module(symbol_class, root_namespace): - """List and add all the atomic symbol functions to current module.""" - _set_symbol_class(symbol_class) - plist = ctypes.POINTER(ctypes.c_char_p)() - size = ctypes.c_uint() - - check_call(_LIB.MXListAllOpNames(ctypes.byref(size), - 
ctypes.byref(plist))) - op_names = [] - for i in range(size.value): - op_names.append(py_str(plist[i])) - - module_obj = _sys.modules["%s.symbol" % root_namespace] - module_internal = _sys.modules["%s._symbol_internal" % root_namespace] - module_contrib = _sys.modules["%s.contrib.symbol" % root_namespace] - for name in op_names: - hdl = OpHandle() - check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl))) - function = _make_atomic_symbol_function(hdl, name) - if function.__name__.startswith('_contrib_'): - function.__name__ = function.__name__[9:] - function.__module__ = 'mxnet.contrib.symbol' - setattr(module_contrib, function.__name__, function) - elif function.__name__.startswith('_'): - setattr(module_internal, function.__name__, function) - else: - setattr(module_obj, function.__name__, function) - - -# Initialize the atomic symbol in startups -_init_symbol_module(Symbol, "mxnet") +_set_symbol_class(Symbol) diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index c5587f8d80a8..e1210fbd3e6e 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -31,15 +31,17 @@ from contextlib import contextmanager import numpy as np import numpy.testing as npt -import mxnet as mx -from .context import Context -from .ndarray import array -from .symbol import Symbol +import numpy.random as rnd try: import requests except ImportError: # in rare cases requests may be not installed pass +import mxnet as mx +from .context import Context +from .ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID +from .ndarray import array +from .symbol import Symbol _rng = np.random.RandomState(1234) @@ -85,6 +87,184 @@ def random_arrays(*shapes): return arrays +def random_sample(population, k): + """Return a k length list of the elements chosen from the population sequence.""" + assert 0 <= k <= len(population) + population_copy = population[:] + np.random.shuffle(population_copy) + return population_copy[0:k] + + +def 
_validate_csr_generation_inputs(num_rows, num_cols, density, + distribution="uniform"): + """Validates inputs for csr generation helper functions + """ + total_nnz = int(num_rows * num_cols * density) + if density < 0 or density > 1: + raise ValueError("density has to be between 0 and 1") + + if num_rows <= 0 or num_cols <= 0: + raise ValueError("num_rows or num_cols should be greater than 0") + + if distribution == "powerlaw": + if total_nnz < 2 * num_rows: + raise ValueError("not supported for this density: %s" + " for this shape (%s, %s)" + " Please keep :" + " num_rows * num_cols * density >= 2 * num_rows" + % (density, num_rows, num_cols)) + + +def _get_uniform_dataset_csr(num_rows, num_cols, density=0.1, dtype=None): + """Returns CSRNDArray with uniform distribution + This generates a csr matrix with totalnnz unique randomly chosen numbers + from num_rows*num_cols and arranges them in the 2d array in the + following way: row_index = (random_number_generated / num_rows) + col_index = random_number_generated - row_index * num_cols + """ + _validate_csr_generation_inputs(num_rows, num_cols, density, + distribution="uniform") + from scipy import sparse as sp + csr = sp.rand(num_rows, num_cols, density, dtype=dtype, format="csr") + result = mx.nd.sparse.csr_matrix(csr.data, csr.indptr, csr.indices, + (num_rows, num_cols), dtype=dtype) + return result + + +def _get_powerlaw_dataset_csr(num_rows, num_cols, density=0.1, dtype=None): + """Returns CSRNDArray with powerlaw distribution + with exponentially increasing number of non zeros in each row. + Not supported for cases where total_nnz < 2*num_rows. This is because + the algorithm first tries to ensure that there are rows with no zeros by + putting non zeros at beginning of each row. 
+ """ + + _validate_csr_generation_inputs(num_rows, num_cols, density, + distribution="powerlaw") + + total_nnz = int(num_rows * num_cols * density) + + unused_nnz = total_nnz + output_arr = np.zeros((num_rows, num_cols), dtype=dtype) + # Start with ones on each row so that no row is empty + for row in range(num_rows): + output_arr[row][0] = 1 + rnd.uniform(0.001, 2) + unused_nnz = unused_nnz - 1 + if unused_nnz <= 0: + return mx.nd.array(output_arr).tostype("csr") + + # Populate rest of matrix with 2^i items in ith row. + # if we have used all total nnz return the sparse matrix + # else if we reached max column size then fill up full columns until we use all nnz + col_max = 2 + for row in range(num_rows): + col_limit = min(num_cols, col_max) + # In case col_limit reached assign same value to all elements, which is much faster + if col_limit == num_cols and unused_nnz > col_limit: + output_arr[row] = 1 + rnd.uniform(0.001, 2) + unused_nnz = unused_nnz - col_limit + 1 + if unused_nnz <= 0: + return mx.nd.array(output_arr).tostype("csr") + else: + continue + for col_index in range(1, col_limit): + output_arr[row][col_index] = 1 + rnd.uniform(0.001, 2) + unused_nnz = unused_nnz - 1 + if unused_nnz <= 0: + return mx.nd.array(output_arr).tostype("csr") + col_max = col_max * 2 + + if unused_nnz > 0: + raise ValueError("not supported for this density: %s" + " for this shape (%s,%s)" % (density, num_rows, num_cols)) + else: + return mx.nd.array(output_arr).tostype("csr") + + +def rand_sparse_ndarray(shape, stype, density=None, distribution=None, dtype=None): + """Generate a random sparse ndarray. 
Returns the ndarray, value(np) and indices(np) + Parameters + ---------- + shape: list or tuple + stype: str, valid values: "csr" or "row_sparse" + density, optional: float, should be between 0 and 1 + distribution, optional: str, valid values: "uniform" or "powerlaw" + dtype, optional: numpy.dtype, default value is None + Returns + ------- + Result of type CSRNDArray or RowSparseNDArray + Examples + -------- + Below is an example of the powerlaw distribution with csr as the stype. + It calculates the nnz using the shape and density. + It fills up the ndarray with exponentially increasing number of elements. + If there are enough unused_nnzs, n+1th row will have twice more nnzs compared to nth row. + else, remaining unused_nnzs will be used in n+1th row + If number of cols is too small and we have already reached column size it will fill up + all following columns in all followings rows until we reach the required density. + + >>> csr_arr, _ = rand_sparse_ndarray(shape=(5, 16), stype="csr", + density=0.50, distribution="powerlaw") + >>> indptr = csr_arr.indptr.asnumpy() + >>> indices = csr_arr.indices.asnumpy() + >>> data = csr_arr.data.asnumpy() + >>> row2nnz = len(data[indptr[1]:indptr[2]]) + >>> row3nnz = len(data[indptr[2]:indptr[3]]) + >>> assert(row3nnz == 2*row2nnz) + >>> row4nnz = len(data[indptr[3]:indptr[4]]) + >>> assert(row4nnz == 2*row3nnz) + """ + density = rnd.rand() if density is None else density + dtype = default_dtype() if dtype is None else dtype + distribution = "uniform" if distribution is None else distribution + if stype == 'row_sparse': + assert (distribution == "uniform"), \ + "Distribution %s not supported for row_sparse" % (distribution) + # sample index + idx_sample = rnd.rand(shape[0]) + indices = np.argwhere(idx_sample < density).flatten() + if indices.shape[0] == 0: + result = mx.nd.zeros(shape, stype='row_sparse', dtype=dtype) + return result, (np.array([], dtype=dtype), np.array([], dtype='int64')) + # generate random values + val 
= rnd.rand(indices.shape[0], *shape[1:]).astype(dtype) + arr = mx.nd.sparse.row_sparse_array(val, indices, shape, indices_type=np.int64, dtype=dtype) + return arr, (val, indices) + elif stype == 'csr': + assert len(shape) == 2 + if distribution == "uniform": + csr = _get_uniform_dataset_csr(shape[0], shape[1], density, dtype=dtype) + return csr, (csr.indptr, csr.indices, csr.data) + elif distribution == "powerlaw": + csr = _get_powerlaw_dataset_csr(shape[0], shape[1], density, dtype=dtype) + return csr, (csr.indptr, csr.indices, csr.data) + else: + assert(False), "Distribution not supported: %s" % (distribution) + else: + assert(False), "unknown storage type" + + +def rand_ndarray(shape, stype, density=None, dtype=None, distribution=None): + if stype == 'default': + arr = mx.nd.array(random_arrays(shape), dtype=dtype) + else: + arr, _ = rand_sparse_ndarray(shape, stype, density=density, dtype=dtype, + distribution=distribution) + return arr + + +def rand_shape_2d(dim0=10, dim1=10): + return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1) + + +def rand_shape_3d(dim0=10, dim1=10, dim2=10): + return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1), rnd.randint(1, dim2 + 1) + + +def rand_shape_nd(n, dim=10): + return rnd.randint(1, dim+1, size=n) + + def np_reduce(dat, axis, keepdims, numpy_reduce_func): """Compatible reduce for old version of NumPy. 
@@ -316,7 +496,8 @@ def _parse_location(sym, location, ctx): % (str(set(sym.list_arguments())), str(set(location.keys())))) else: location = {k: v for k, v in zip(sym.list_arguments(), location)} - location = {k: mx.nd.array(v, ctx=ctx) for k, v in location.items()} + location = {k: mx.nd.array(v, ctx=ctx) if isinstance(v, np.ndarray) \ + else v for k, v in location.items()} return location @@ -437,7 +618,8 @@ def numeric_grad(executor, location, aux_states=None, eps=1e-4, use_forward_trai def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rtol=1e-2, - atol=None, grad_nodes=None, use_forward_train=True, ctx=None): + atol=None, grad_nodes=None, use_forward_train=True, ctx=None, + grad_stype_dict=None): """Verify an operation by checking backward pass via finite difference method. Based on Theano's `theano.gradient.verify_grad` [1] @@ -454,7 +636,7 @@ def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rto - if type is dict of str -> numpy.ndarray maps the name of arguments to the corresponding numpy.ndarray. *In either case, value of all the arguments must be provided.* - aux_states : ist or tuple or dict, optional + aux_states : list or tuple or dict, optional The auxiliary states required when generating the executor for the symbol. numeric_eps : float, optional Delta for the finite difference method that approximates the gradient. @@ -466,6 +648,8 @@ def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rto Whether to use is_train=True when computing the finite-difference. ctx : Context, optional Check the gradient computation on the specified device. + grad_stype_dict : dict of str->str, optional + Storage type dictionary for gradient ndarrays. 
References --------- ..[1] https://github.com/Theano/Theano/blob/master/theano/gradient.py @@ -489,7 +673,7 @@ def random_projection(shape): location_npy = {k:v.asnumpy() for k, v in location.items()} aux_states = _parse_aux_states(sym=sym, aux_states=aux_states, ctx=ctx) if aux_states is not None: - aux_states_npy = {k:v.asnumpy() for k, v in aux_states.items()} + aux_states_npy = {k: v.asnumpy() for k, v in aux_states.items()} else: aux_states_npy = None if grad_nodes is None: @@ -516,6 +700,14 @@ def random_projection(shape): + [("__random_proj", _rng.normal(0, 0.01, size=out_shape[0]))]) args_grad = {k: mx.nd.array(v, ctx=ctx) for k, v in args_grad_npy.items()} + if grad_stype_dict is not None: + assert isinstance(grad_stype_dict, dict), "grad_stype_dict must be a dict" + for k, v in grad_stype_dict.items(): + if k in args_grad and v in _STORAGE_TYPE_STR_TO_ID and v != 'default': + # create an uninitialized sparse ndarray for executor + # if the symbolic grad is expected to be zero, it should not be initialized at all + args_grad[k] = mx.nd.zeros(args_grad[k].shape, args_grad[k].context, + args_grad[k].dtype, v) executor = out.bind(ctx, grad_req=grad_req, args=location, args_grad=args_grad, aux_states=aux_states) @@ -607,15 +799,15 @@ def check_symbolic_forward(sym, location, expected, rtol=1E-4, atol=None, g[:] = 0 executor.forward(is_train=False) - outputs = [x.asnumpy() for x in executor.outputs] + outputs = [x.asnumpy() for x in executor.outputs] for output_name, expect, output in zip(sym.list_outputs(), expected, outputs): assert_almost_equal(expect, output, rtol, atol, ("EXPECTED_%s"%output_name, "FORWARD_%s"%output_name)) def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol=None, - aux_states=None, grad_req='write', ctx=None): + aux_states=None, grad_req='write', ctx=None, grad_stypes=None): """Compares a symbol's backward results with the expected ones. 
Prints error messages if the backward results are not the same as the expected results. @@ -651,6 +843,8 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol= Gradient requirements. 'write', 'add' or 'null'. ctx : Context, optional Running context. + grad_stypes: dict of str->str + dictionary of mapping argument name to stype for the gradient Example ------- @@ -676,14 +870,23 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol= if isinstance(expected, (list, tuple)): expected = {k:v for k, v in zip(sym.list_arguments(), expected)} args_grad_npy = {k:_rng.normal(size=v.shape) for k, v in expected.items()} - args_grad_data = {k: mx.nd.array(v, ctx=ctx) for k, v in args_grad_npy.items()} + args_grad_data = {} + for k, v in args_grad_npy.items(): + nd = mx.nd.array(v, ctx=ctx) + if grad_stypes is not None and k in grad_stypes: + args_grad_data[k] = nd.tostype(grad_stypes[k]) + else: + args_grad_data[k] = nd + if isinstance(grad_req, str): grad_req = {k:grad_req for k in sym.list_arguments()} elif isinstance(grad_req, (list, tuple)): grad_req = {k:v for k, v in zip(sym.list_arguments(), grad_req)} - executor = sym.bind(ctx=ctx, args=location, args_grad=args_grad_data, aux_states=aux_states) + executor = sym.bind(ctx=ctx, args=location, args_grad=args_grad_data, + aux_states=aux_states, grad_req=grad_req) executor.forward(is_train=True) + if isinstance(out_grads, (tuple, list)): out_grads = [mx.nd.array(v, ctx=ctx) for v in out_grads] elif isinstance(out_grads, (dict)): diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 93458d21ac5a..0fe3fe3e302e 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -172,6 +172,39 @@ int MXNDArrayCreateEx(const mx_uint *shape, API_END(); } +int MXNDArrayCreateSparseEx(int storage_type, + const mx_uint *shape, + mx_uint ndim, + int dev_type, + int dev_id, + int delay_alloc, + int dtype, + mx_uint num_aux, + int *aux_type, + mx_uint *aux_ndims, + const mx_uint 
*aux_shape, + NDArrayHandle *out) { + API_BEGIN(); + std::vector aux_types; + std::vector aux_shapes; + auto shape_start = aux_shape; + for (size_t i = 0; i < num_aux; i++) { + // types + aux_types.push_back(aux_type[i]); + // shapes + aux_shapes.emplace_back(shape_start, shape_start + aux_ndims[i]); + shape_start += aux_ndims[i]; + } + *out = new NDArray( + NDArrayStorageType(storage_type), + TShape(shape, shape + ndim), + Context::Create(static_cast(dev_type), dev_id), + delay_alloc != 0, + dtype, aux_types, aux_shapes); + API_END(); +} + + int MXNDArrayLoadFromRawBytes(const void *buf, size_t size, NDArrayHandle *out) { @@ -215,6 +248,23 @@ int MXNDArraySyncCopyToCPU(NDArrayHandle handle, API_END(); } +/*! + * \brief Copy src.data() to dst.data() if i = -1, else dst.aux_data(i) if i >= 0 + * This function blocks. Do not use it in performance critical code. + * \param handle_dst handle of a dst ndarray whose data/aux_data has been allocated + * \param handle_src handle of a src ndarray which has default storage type + * \param i dst data blob indicator + */ +int MXNDArraySyncCopyFromNDArray(NDArrayHandle handle_dst, + const NDArrayHandle handle_src, + const int i) { + API_BEGIN(); + NDArray* dst = static_cast(handle_dst); + NDArray* src = static_cast(handle_src); + dst->SyncCopyFromNDArray(*src, -1, i); + API_END(); +} + int MXNDArrayWaitToRead(NDArrayHandle handle) { API_BEGIN(); static_cast(handle)->WaitToRead(); @@ -351,6 +401,18 @@ MXNET_DLL int MXNDArrayReshape(NDArrayHandle handle, API_END_HANDLE_ERROR(delete ptr); } +int MXNDArrayGetStorageType(NDArrayHandle handle, + int *out_storage_type) { + API_BEGIN(); + NDArray *arr = static_cast(handle); + if (!arr->is_none()) { + *out_storage_type = arr->storage_type(); + } else { + *out_storage_type = kUndefinedStorage; + } + API_END(); +} + int MXNDArrayGetShape(NDArrayHandle handle, mx_uint *out_dim, const mx_uint **out_pdata) { @@ -400,6 +462,42 @@ int MXNDArrayGetDType(NDArrayHandle handle, API_END(); } +int 
MXNDArrayGetAuxType(NDArrayHandle handle, + mx_uint i, + int *out_type) { + API_BEGIN(); + NDArray *arr = static_cast(handle); + *out_type = arr->aux_type(i); + API_END(); +} + +/*! + * \brief Get a deep copy of the ith aux data blob + * in the form of an NDArray of default storage type. + * This function blocks. Do not use it in performance critical code. + */ +int MXNDArrayGetAuxNDArray(NDArrayHandle handle, + mx_uint i, + NDArrayHandle *out) { + API_BEGIN(); + NDArray *arr = static_cast(handle); + *out = new NDArray(arr->aux_ndarray(i)); + API_END(); +} + +/*! + * \brief Get a deep copy of the data blob + * in the form of an NDArray of default storage type. + * This function blocks. Do not use it in performance critical code. + */ +int MXNDArrayGetDataNDArray(NDArrayHandle handle, + NDArrayHandle *out) { + API_BEGIN(); + NDArray *arr = static_cast(handle); + *out = new NDArray(arr->data_ndarray()); + API_END(); +} + int MXNDArrayGetContext(NDArrayHandle handle, int *out_dev_type, int *out_dev_id) { @@ -735,6 +833,24 @@ int MXKVStorePullEx(KVStoreHandle handle, API_END(); } +int MXKVStorePullRowSparse(KVStoreHandle handle, + mx_uint num, + const char** keys, + NDArrayHandle* vals, + const NDArrayHandle* row_ids, + int priority) { + API_BEGIN(); + std::vector v_keys(num); + std::vector> v_val_rowids(num); + for (mx_uint i = 0; i < num; ++i) { + v_keys[i] = keys[i]; + v_val_rowids[i] = std::make_pair(static_cast(vals[i]), + *static_cast(row_ids[i])); + } + static_cast(handle)->PullRowSparse(v_keys, v_val_rowids, priority); + API_END(); +} + int MXKVStoreSetUpdater(KVStoreHandle handle, MXKVStoreUpdater updater, void* updater_handle) { diff --git a/src/c_api/c_api_common.h b/src/c_api/c_api_common.h index 846b53973b07..fee3f03f6db0 100644 --- a/src/c_api/c_api_common.h +++ b/src/c_api/c_api_common.h @@ -76,6 +76,8 @@ struct MXAPIThreadLocalEntry { std::vector arg_shapes, out_shapes, aux_shapes; /*! 
\brief result holder for returning type flags */ std::vector arg_types, out_types, aux_types; + /*! \brief result holder for returning storage types */ + std::vector arg_storage_types, out_storage_types, aux_storage_types; /*! \brief result holder for returning shape dimensions */ std::vector arg_shape_ndim, out_shape_ndim, aux_shape_ndim; /*! \brief result holder for returning shape pointer */ diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc index a4c48e426879..631c1a7d93eb 100644 --- a/src/c_api/c_api_executor.cc +++ b/src/c_api/c_api_executor.cc @@ -198,6 +198,9 @@ int MXExecutorBindEX(SymbolHandle symbol_handle, * \param num_provided_arg_dtypes number of user provided in_arg and axu_state dtypes * \param provided_arg_dtype_names argument name list of provided dtypes * \param provided_arg_dtypes data of provided dtypes + * \param num_provided_arg_stypes number of user provided in_arg and axu_state storage types + * \param provided_arg_stype_names argument name list of provided storage types + * \param provided_arg_stypes data of provided storage types * \param num_shared_arg_names number of parameter names passed from _bind_ith_exec * \param shared_arg_name_list parameter name list passed from _bind_ith_exec * \param shared_buffer_len number of shared data arrays passed from _bind_ith_exec @@ -230,6 +233,9 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle, const mx_uint num_provided_arg_dtypes, const char** provided_arg_dtype_names, const int* provided_arg_dtypes, + const mx_uint num_provided_arg_stypes, + const char** provided_arg_stype_names, + const int* provided_arg_stypes, const mx_uint num_shared_arg_names, const char** shared_arg_name_list, int* shared_buffer_len, @@ -254,7 +260,7 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle, // attr_dict for setting up type_dict and arg/aux ctx std::unordered_map> attr_dict; - if (nullptr == provided_arg_dtypes || nullptr != g2c_keys) { + if (nullptr == provided_arg_dtypes || nullptr 
!= g2c_keys || nullptr == provided_arg_stypes) { std::vector> attrs = sym->ListAttrsRecursive(); attr_dict.reserve(attrs.size()); @@ -280,6 +286,23 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle, } } + // setup arg_stype_map + std::unordered_map arg_stype_map; + if (nullptr == provided_arg_stypes) { // use attr_dict + for (const auto& arg_name : in_arg_names) { + const auto it = attr_dict.find(arg_name); + if (it == attr_dict.end() || !it->second.count("__storage_type__")) { + arg_stype_map[arg_name] = kDefaultStorage; + } + } + } else { // use user input type_dict + // create stype map for in_args and aux_states + arg_stype_map.reserve(num_provided_arg_stypes); + for (mx_uint i = 0; i < num_provided_arg_stypes; ++i) { + arg_stype_map[provided_arg_stype_names[i]] = provided_arg_stypes[i]; + } + } + // create default ctx Context ctx = Context::Create(static_cast(dev_type), dev_id); // create ctx map @@ -420,9 +443,10 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle, std::vector aux_state_vec; *out = Executor::SimpleBind(*sym, ctx, ctx_map, in_arg_ctx_vec, arg_grad_ctx_vec, - aux_state_ctx_vec, arg_shape_map, arg_dtype_map, grad_req_type_vec, - shared_arg_name_set, &in_arg_vec, &arg_grad_vec, &aux_state_vec, - use_shared_buffer? &shared_buffer_map : nullptr, + aux_state_ctx_vec, arg_shape_map, arg_dtype_map, arg_stype_map, + grad_req_type_vec, shared_arg_name_set, &in_arg_vec, + &arg_grad_vec, &aux_state_vec, + use_shared_buffer ? &shared_buffer_map : nullptr, reinterpret_cast(shared_exec_handle)); // copy ndarray ptrs to ret->handles so that front end diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index 3202f55abea7..d392baf45d3e 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -18,7 +18,8 @@ */ /*! 
- * \file c_api_symbolic.cc + * Copyright (c) 2016 by Contributors + * \file c_api_ndarray.cc * \brief C API of mxnet */ @@ -150,14 +151,17 @@ void SetContext(Context* p_ctx, #endif // MXNET_USE_CUDA } +// Set the shape, dtype and storage type void SetShapeType(const nnvm::Op* op, const nnvm::NodeAttrs& attrs, const Context& ctx, const std::vector& ndinputs, - std::vector* p_ndoutputs) { + std::vector* p_ndoutputs, + int* dispatch_stype) { std::vector& ndoutputs = *p_ndoutputs; static auto& infershape = nnvm::Op::GetAttr("FInferShape"); static auto& infertype = nnvm::Op::GetAttr("FInferType"); + static auto& inferstorage = nnvm::Op::GetAttr("FInferStorageType"); MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); // infer shape std::vector& in_shapes = ret->arg_shapes; @@ -193,9 +197,35 @@ void SetShapeType(const nnvm::Op* op, CHECK(infertype[op](attrs, &in_types, &out_types)); CHECK_EQ(out_types.size(), ndoutputs.size()); + // infer storage type + auto& in_storage_types = ret->arg_storage_types; + auto& out_storage_types = ret->out_storage_types; + in_storage_types.clear(); + out_storage_types.clear(); + for (auto& i : ndinputs) { + in_storage_types.push_back(i.storage_type()); + } + for (auto& i : ndoutputs) { + out_storage_types.push_back(i.storage_type()); + } + if (inferstorage.count(op)) { + CHECK(inferstorage[op](attrs, ctx, &in_storage_types, &out_storage_types)); + CHECK_EQ(out_storage_types.size(), ndoutputs.size()); + } + + bool contains_non_default = common::ContainsNonDefaultStorage(in_storage_types); + contains_non_default |= common::ContainsNonDefaultStorage(out_storage_types); + int kNonDefaultStorage = -2; + *dispatch_stype = contains_non_default ? 
kNonDefaultStorage : kDefaultStorage; for (size_t i = 0; i < ndoutputs.size(); ++i) { + NDArrayStorageType storage_type = static_cast(out_storage_types[i]); if (ndoutputs[i].is_none()) { - ndoutputs[i] = NDArray(out_shapes[i], ctx, true, out_types[i]); + // if failed to infer the storage type, assume the output storage is dense + if (storage_type == kDefaultStorage || out_storage_types[i] == kUndefinedStorage) { + ndoutputs[i] = NDArray(out_shapes[i], ctx, true, out_types[i]); + } else { + ndoutputs[i] = NDArray(storage_type, out_shapes[i], ctx, true, out_types[i]); + } } else { CHECK_EQ(ndoutputs[i].shape(), out_shapes[i]) << i << "th output has invalid shape. " @@ -212,7 +242,7 @@ void SetShapeType(const nnvm::Op* op, void SetDependency(std::vector *p_read_vars, std::vector *p_write_vars, std::vector *p_requested, - std::vector *p_auxidx, + std::vector *p_mutate_idx, const nnvm::Op* op, const nnvm::NodeAttrs& attrs, const Context& ctx, @@ -224,7 +254,7 @@ void SetDependency(std::vector *p_read_vars, std::vector& read_vars = *p_read_vars; std::vector& write_vars = *p_write_vars; std::vector& requested = *p_requested; - std::vector& auxidx = *p_auxidx; + std::vector& mutate_idx = *p_mutate_idx; if (tmp_resource.count(op)) { int ntmp = 0; @@ -250,15 +280,30 @@ void SetDependency(std::vector *p_read_vars, write_vars.push_back(i.var()); } if (mutate.count(op)) { - auxidx = mutate[op](attrs); - std::sort(auxidx.begin(), auxidx.end()); - for (auto & i : auxidx) { + mutate_idx = mutate[op](attrs); + std::sort(mutate_idx.begin(), mutate_idx.end()); + for (auto & i : mutate_idx) { write_vars.push_back(ndinputs[i].var()); } } Engine::Get()->DeduplicateVarHandle(&read_vars, &write_vars); } +inline void SetWriteInplaceReq(const std::vector &ndinputs, + const std::vector &ndoutputs, + std::vector *req) { + std::unordered_set in_vars; + for (auto &nd : ndinputs) { + in_vars.insert(nd.var()); + } + for (size_t i = 0; i < ndoutputs.size(); i++) { + // output NDArray shares the 
memory with the input NDArray + if (in_vars.find(ndoutputs[i].var()) != in_vars.end()) { + req->at(i) = kWriteInplace; + } + } +} + void PushFCompute(const FCompute& fn, const nnvm::Op* op, const nnvm::NodeAttrs& attrs, @@ -267,24 +312,75 @@ void PushFCompute(const FCompute& fn, const std::vector& write_vars, const std::vector& requested, const std::vector& ndinputs, - const std::vector& ndoutputs) { + const std::vector& ndoutputs, + const std::vector& mutate_idx) { + using namespace common; bool is_train = AutogradRuntime::Get()->IsTraining(); Engine::Get()->PushAsync( - [ctx, attrs, fn, ndinputs, ndoutputs, requested, is_train]( + [ctx, attrs, fn, ndinputs, ndoutputs, requested, is_train, mutate_idx]( RunContext rctx, engine::CallbackOnComplete on_complete) { std::vector input_blobs, output_blobs; - for (auto& i : ndinputs) { - input_blobs.push_back(i.data()); - } - for (auto& i : ndoutputs) { - output_blobs.push_back(i.data()); + // pre-fcompute and post-fcompute storage fallback src NDArrays and dst NDArrays + std::vector pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src; + // mapping from index in input_blobs to index in pre_temp_dst + std::unordered_map in_temp_idx_map; + // populate input blobs and output blobs + SetupDefaultBlobs(ndinputs, &input_blobs, &pre_temp_src, &pre_temp_dst, &in_temp_idx_map); + SetupDefaultBlobs(ndoutputs, &output_blobs, &post_temp_dst, &post_temp_src); + // add mutable inputs to post temp list + for (const auto idx : mutate_idx) { + auto map_iter = in_temp_idx_map.find(idx); + if (map_iter != in_temp_idx_map.end()) { + post_temp_src.push_back(pre_temp_dst[map_iter->second]); + post_temp_dst.push_back(ndinputs[idx]); + } } OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested}; std::vector req(output_blobs.size(), kWriteTo); - fn(attrs, opctx, input_blobs, req, output_blobs); + if (ctx.dev_mask() == gpu::kDevMask) { +#if MXNET_USE_CUDA + CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx); + fn(attrs, 
opctx, input_blobs, req, output_blobs); + // cast to original storage type, if necessary + CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx); + rctx.get_stream()->Wait(); +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif + } else { + CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx); + fn(attrs, opctx, input_blobs, req, output_blobs); + // cast to original storage type, if necessary + CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx); + } + on_complete(); + }, ctx, read_vars, write_vars, FnProperty::kNormal, + 0, PROFILER_MESSAGE(op->name.c_str())); +} + +void PushFComputeEx(const FComputeEx& fn, + const nnvm::Op* op, + const nnvm::NodeAttrs& attrs, + const Context& ctx, + const std::vector& read_vars, + const std::vector& write_vars, + const std::vector& requested, + const std::vector& ndinputs, + const std::vector& ndoutputs) { + Engine::Get()->PushAsync( + [ctx, attrs, fn, ndinputs, ndoutputs, requested]( + RunContext rctx, + engine::CallbackOnComplete on_complete) { + std::vector input_blobs, output_blobs; + OpContext opctx{false, rctx, + engine::CallbackOnComplete(), + requested}; + std::vector req(ndoutputs.size(), kWriteTo); + SetWriteInplaceReq(ndinputs, ndoutputs, &req); + fn(attrs, opctx, ndinputs, req, ndoutputs); if (ctx.dev_mask() == gpu::kDevMask) { rctx.get_stream()->Wait(); } @@ -301,7 +397,9 @@ void PushOperator(const OpStatePtr& state, const std::vector& write_vars, const std::vector& requested, const std::vector& ndinputs, - const std::vector& ndoutputs) { + const std::vector& ndoutputs, + const std::vector& mutate_idx) { + using namespace common; static auto& fexec_type = nnvm::Op::GetAttr("FExecType"); bool is_train = AutogradRuntime::Get()->IsTraining(); @@ -314,15 +412,40 @@ void PushOperator(const OpStatePtr& state, if (fcompute != nullptr) { CHECK(exec_type == ExecType::kSync || exec_type == ExecType::kAsync); Engine::Get()->PushAsync( - [state, fcompute, ndinputs, ndoutputs, requested, is_train, 
exec_type]( + [state, fcompute, ndinputs, ndoutputs, requested, is_train, exec_type, mutate_idx]( RunContext rctx, engine::CallbackOnComplete on_complete) { OpContext opctx{is_train, rctx, on_complete, requested}; + std::vector input_blobs, output_blobs; - for (const auto& i : ndinputs) input_blobs.push_back(i.data()); - for (const auto& i : ndoutputs) output_blobs.push_back(i.data()); + // pre-fcompute and post-fcompute storage fallback src NDArrays and dst NDArrays + std::vector pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src; + // mapping from index in input_blobs to index in pre_temp_dst + std::unordered_map in_temp_idx_map; + // populate input blobs and output blobs + SetupDefaultBlobs(ndinputs, &input_blobs, &pre_temp_src, &pre_temp_dst, &in_temp_idx_map); + SetupDefaultBlobs(ndoutputs, &output_blobs, &post_temp_dst, &post_temp_src); + // add mutable inputs to post temp list + for (const auto idx : mutate_idx) { + if (in_temp_idx_map.find(idx) != in_temp_idx_map.end()) { + post_temp_src.push_back(pre_temp_dst[in_temp_idx_map[idx]]); + post_temp_dst.push_back(ndinputs[idx]); + } + } std::vector req(output_blobs.size(), kWriteTo); - fcompute(state, opctx, input_blobs, req, output_blobs); + if (rctx.get_ctx().dev_mask() == gpu::kDevMask) { +#if MXNET_USE_CUDA + CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx); + fcompute(state, opctx, input_blobs, req, output_blobs); + CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx); +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif + } else { + CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx); + fcompute(state, opctx, input_blobs, req, output_blobs); + CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx); + } if (exec_type == ExecType::kSync) { if (rctx.get_ctx().dev_mask() == gpu::kDevMask) { rctx.get_stream()->Wait(); @@ -342,6 +465,7 @@ void PushOperator(const OpStatePtr& state, engine::CallbackOnComplete on_complete) { OpContext opctx{is_train, rctx, on_complete, 
requested}; std::vector req(ndoutputs.size(), kWriteTo); + SetWriteInplaceReq(ndinputs, ndoutputs, &req); fcompute_ex(state, opctx, ndinputs, req, ndoutputs); if (exec_type == ExecType::kSync) { if (rctx.get_ctx().dev_mask() == gpu::kDevMask) { @@ -363,8 +487,6 @@ void ImperativeInvokeImpl(const Context& default_ctx, const nnvm::NodeAttrs& attrs, std::vector* p_ndinputs, std::vector* p_ndoutputs) { - static auto& fcpu = nnvm::Op::GetAttr("FCompute"); - static auto& fgpu = nnvm::Op::GetAttr("FCompute"); static auto& ndfunc = nnvm::Op::GetAttr("FNDArrayFunction"); static auto& createop = nnvm::Op::GetAttr("FCreateOpState"); MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); @@ -379,29 +501,32 @@ void ImperativeInvokeImpl(const Context& default_ctx, } else { // TODO(piiswrong): infer ctx Context ctx; + int stype; SetContext(&ctx, attrs, ndinputs, ndoutputs, default_ctx); - SetShapeType(op, attrs, ctx, ndinputs, &ndoutputs); + SetShapeType(op, attrs, ctx, ndinputs, &ndoutputs, &stype); std::vector read_vars, write_vars; std::vector requested; - std::vector auxidx; - SetDependency(&read_vars, &write_vars, &requested, &auxidx, + std::vector mutate_idx; + SetDependency(&read_vars, &write_vars, &requested, &mutate_idx, op, attrs, ctx, ndinputs, ndoutputs); - FCompute fn; - if (ctx.dev_mask() == cpu::kDevMask && fcpu.count(op)) { - fn = fcpu[op]; - } else if (ctx.dev_mask() == gpu::kDevMask && fgpu.count(op)) { - fn = fgpu[op]; - } - - if (fn) { + FCompute fn = common::GetFCompute(op, "FCompute", ctx); + FComputeEx fn_ex = common::GetFCompute(op, "FComputeEx", ctx); + if (fn_ex && stype != kDefaultStorage) { if (AutogradRuntime::Get()->IsRecording()) { AutogradRuntime::Get()->RecordImperativeFCompute(op, attrs, &ndinputs, &ndoutputs); } - PushFCompute(fn, op, attrs, ctx, read_vars, write_vars, + PushFComputeEx(fn_ex, op, attrs, ctx, read_vars, write_vars, requested, ndinputs, ndoutputs); + } else if (fn) { + if (AutogradRuntime::Get()->IsRecording()) { + 
AutogradRuntime::Get()->RecordImperativeFCompute(op, + attrs, &ndinputs, &ndoutputs); + } + PushFCompute(fn, op, attrs, ctx, read_vars, write_vars, + requested, ndinputs, ndoutputs, mutate_idx); } else if (createop.count(op)) { auto state = createop[op](attrs, ctx, ret->arg_shapes, ret->arg_types); @@ -411,7 +536,7 @@ void ImperativeInvokeImpl(const Context& default_ctx, } write_vars.push_back(state.get_var()); PushOperator(state, op, attrs, ctx, read_vars, write_vars, - requested, ndinputs, ndoutputs); + requested, ndinputs, ndoutputs, mutate_idx); } else { LOG(FATAL) << "Operator " << op->name << " is not implemented for " @@ -461,6 +586,28 @@ int MXImperativeInvoke(AtomicSymbolCreator creator, API_END(); } +int MXImperativeInvokeEx(AtomicSymbolCreator creator, + int num_inputs, + NDArrayHandle *inputs, + int *num_outputs, + NDArrayHandle **outputs, + int num_params, + const char **param_keys, + const char **param_vals, + const int **out_stypes) { // outputs storage types + API_BEGIN(); + MXImperativeInvoke(creator, num_inputs, inputs, num_outputs, outputs, + num_params, param_keys, param_vals); + MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); + NDArray** output_nds = reinterpret_cast(*outputs); + ret->out_types.resize(*num_outputs); + for (int i = 0; i < *num_outputs; ++i) { + ret->out_types[i] = output_nds[i]->storage_type(); + } + *out_stypes = dmlc::BeginPtr(ret->out_types); + API_END(); +} + int MXCreateCachedOp(SymbolHandle handle, CachedOpHandle *out) { nnvm::Symbol* sym = static_cast(handle); @@ -540,6 +687,24 @@ int MXInvokeCachedOp(CachedOpHandle handle, API_END(); } +int MXInvokeCachedOpEx(CachedOpHandle handle, + int num_inputs, + NDArrayHandle *inputs, + int *num_outputs, + NDArrayHandle **outputs, + const int **out_stypes) { // outputs storage types + API_BEGIN(); + MXInvokeCachedOp(handle, num_inputs, inputs, num_outputs, outputs); + MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); + NDArray** output_nds = 
reinterpret_cast(*outputs); + ret->out_types.resize(*num_outputs); + for (int i = 0; i < *num_outputs; ++i) { + ret->out_types[i] = output_nds[i]->storage_type(); + } + *out_stypes = dmlc::BeginPtr(ret->out_types); + API_END(); +} + int MXAutogradIsTraining(bool* curr) { API_BEGIN(); *curr = AutogradRuntime::Get()->IsTraining(); diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index e2c29b888ada..d526aea0d35f 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -29,6 +29,7 @@ #include #include "./c_api_common.h" #include "../operator/operator_common.h" +#include "../executor/exec_pass.h" namespace mxnet { namespace op { @@ -459,7 +460,7 @@ int MXSymbolInferShape(SymbolHandle sym, } try { - g = nnvm::pass::InferShape(std::move(g), arg_shapes, "__shape__"); + g = mxnet::exec::InferShape(std::move(g), arg_shapes, "__shape__"); } catch (const mxnet::op::InferShapeError &err) { throw dmlc::Error(err.msg); } @@ -544,7 +545,7 @@ int MXSymbolInferType(SymbolHandle sym, mxnet::MatchArguments(g.indexed_graph(), kwargs, &arg_types, "InferType"); } - g = nnvm::pass::InferType(std::move(g), arg_types, "__dtype__"); + g = mxnet::exec::InferType(std::move(g), arg_types, "__dtype__"); // copy back CopyAttr(g.indexed_graph(), g.GetAttr("dtype"), &(ret->arg_types), &(ret->out_types), &(ret->aux_types)); diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc index 5ca01492800e..dda4fda1ed8f 100644 --- a/src/c_api/c_predict_api.cc +++ b/src/c_api/c_predict_api.cc @@ -32,6 +32,7 @@ #include #include "./c_api_common.h" #include "../operator/operator_common.h" +#include "../executor/exec_pass.h" using namespace mxnet; @@ -194,7 +195,7 @@ int MXPredCreatePartialOut(const char* symbol_json_str, } } nnvm::Graph g; g.outputs = sym.outputs; - g = nnvm::pass::InferShape(std::move(g), in_shapes, "__shape__"); + g = mxnet::exec::InferShape(std::move(g), in_shapes, "__shape__"); bool infer_complete = 
(g.GetAttr("shape_num_unknown_nodes") == 0); CHECK(infer_complete) << "The shape information of is not enough to get the shapes"; diff --git a/src/common/utils.cc b/src/common/utils.cc new file mode 100644 index 000000000000..125e4e5dc7d7 --- /dev/null +++ b/src/common/utils.cc @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file utils.cc + * \brief cpu implementation of util functions + */ + +#include "./utils.h" +#include "../operator/tensor/cast_storage-inl.h" + +namespace mxnet { +namespace common { + +template<> +void CastStorageDispatch(const OpContext& ctx, + const NDArray& input, + const NDArray& output) { + mxnet::op::CastStorageComputeImpl(ctx, input, output); +} + +} // namespace common +} // namespace mxnet diff --git a/src/common/utils.cu b/src/common/utils.cu new file mode 100644 index 000000000000..093480a98907 --- /dev/null +++ b/src/common/utils.cu @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file utils.cu + * \brief gpu implementation of util functions + */ + +#include "./utils.h" +#include "../operator/tensor/cast_storage-inl.h" + +namespace mxnet { +namespace common { + +template<> +void CastStorageDispatch(const OpContext& ctx, + const NDArray& input, + const NDArray& output) { + mxnet::op::CastStorageComputeImpl(ctx, input, output); +} + +} // namespace common +} // namespace mxnet diff --git a/src/common/utils.h b/src/common/utils.h index 85e30970f1a0..92631a9b5c34 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -24,7 +24,14 @@ #ifndef MXNET_COMMON_UTILS_H_ #define MXNET_COMMON_UTILS_H_ -#if DMLC_USE_CXX11 +#include +#include +#include +#include +#include +#include +#include + #include #include #include @@ -33,15 +40,100 @@ #include #include #include -#endif // DMLC_USE_CXX11 - -#include -#include +#include namespace mxnet { namespace common { -#if DMLC_USE_CXX11 +template +void CastStorageDispatch(const OpContext& ctx, const NDArray& input, const NDArray& output); + +/* + * \brief setup default-storage tblobs from source NDArrays. If any source NDArray has non-default + * storage, it creates a temp NDArray with default storage and uses the temp tblob. The + * function also records the indices of non-default source NDArrays and the indices of + * their corresponding temporary NDArrays in the temp array. 
+ * \param src list of source NDArray + * \param blobs list of tblobs to return + * \param temp_src list of source NDArrays which require temporary default storage representation + * \param temp_dst list of temporary destination NDArrays for default storage representation + * \param idx_map mapping from indices in source NDArrays to indices in temp_dst. When not set, + indices are not recorded + * \return true if any source NDArray needs to cast storage + */ +inline bool SetupDefaultBlobs(const std::vector& src, + std::vector *blobs, + std::vector *temp_src, + std::vector *temp_dst, + std::unordered_map *idx_map = nullptr) { + bool require_cast = false; + for (size_t i = 0; i < src.size(); i++) { + auto& nd = src[i]; + if (nd.storage_type() != kDefaultStorage) { + if (idx_map != nullptr) { + (*idx_map)[i] = temp_dst->size(); + } + NDArray temp(nd.shape(), nd.ctx(), false, nd.dtype()); + temp_src->emplace_back(nd); + temp_dst->emplace_back(temp); + blobs->emplace_back(temp.data()); + require_cast = true; + } else { + blobs->push_back(nd.data()); + } + } + return require_cast; +} + +/* + * \brief cast the NDArrays in `src` and store the result in NDArrays in `dst`. + * This is only used for storage fallback in executor. + * When storage_fallback is false, and `MXNET_EXEC_STORAGE_FALLBACK` == 0, + * storage fallback is disallowed. + * \param src list of source NDArray to cast + * \param dst list of destination NDArrays which hold the result of cast_storage operation + * \param ctx operator context for cast_storage operation + * \param storage_fallback whether storage_fallback is allowed. When set to false, + * its value depends on `MXNET_EXEC_STORAGE_FALLBACK`. 
+ */ +template +inline void CastNonDefaultStorage(const std::vector& src, + const std::vector& dst, + const OpContext& ctx, + bool storage_fallback = false) { + CHECK_GE(dst.size(), src.size()); + if (src.size() == 0) return; + if (storage_fallback == false) { + storage_fallback = dmlc::GetEnv("MXNET_EXEC_STORAGE_FALLBACK", true); + } + if (storage_fallback == false) { + LOG(FATAL) << "Storage type conversion detected during execution. " + << "You are probably executing an operator which " + << "doesn't support NDArray inputs with non-default storage."; + } + for (size_t i = 0; i < src.size(); i++) { + CastStorageDispatch(ctx, src[i], dst[i]); + } +} + +// Check if any storage type is not default storage +inline bool ContainsNonDefaultStorage(const StorageTypeVector& vstorage) { + for (const auto& i : vstorage) { + if (i != kUndefinedStorage && i != kDefaultStorage) return true; + } + return false; +} + +// Check if any NDArray in the list has default storage +inline bool ContainsDefaultStorage(const std::vector& ndarrays) { + for (const auto &nd : ndarrays) { + if (nd.storage_type() == kDefaultStorage) { + return true; + } + } + return false; +} + // heuristic to dermine number of threads per GPU inline int GetNumThreadPerGPU() { // This is resource efficient option. @@ -56,6 +148,67 @@ inline int GetExecNumMatchColor() { return std::min(num_match_color, GetNumThreadPerGPU()); } +template +V ParallelAccumulate(const T* a, const int n, V start) { + V sum = start; +#pragma omp parallel for reduction(+:sum) + for (int i = 0; i < n; ++i) { + sum += a[i]; + } + return sum; +} + +/*! + * \brief + * Helper function for ParallelSort. + * DO NOT call this function directly. + * Use the interface ParallelSort instead. 
+ * Ref: https://github.com/dmlc/difacto/blob/master/src/common/parallel_sort.h + */ +template +void ParallelSortHelper(RandomIt first, size_t len, + size_t grainsize, const Compare& comp) { + if (len < grainsize) { + std::sort(first, first+len, comp); + } else { + std::thread thr(ParallelSortHelper, first, len/2, grainsize, comp); + ParallelSortHelper(first+len/2, len - len/2, grainsize, comp); + thr.join(); + std::inplace_merge(first, first+len/2, first+len, comp); + } +} + +/*! + * \brief + * Sort the elements in the range [first, last) into the ascending order defined by + * the comparator comp. + * If the length of the range [first, last) is greater than a certain threshold, + * the range will be recursively divided into two and assign two threads + * to sort each half range. + * Ref: https://github.com/dmlc/difacto/blob/master/src/common/parallel_sort.h + */ +template +void ParallelSort(RandomIt first, RandomIt last, size_t num_threads, Compare comp) { + const auto num = std::distance(first, last); + size_t grainsize = std::max(num / num_threads + 5, static_cast(1024*16)); + ParallelSortHelper(first, num, grainsize, comp); +} + +/*! + * \brief + * Sort the elements in the range [first, last) into ascending order. + * The elements are compared using the default < operator. + * If the length of the range [first, last) is greater than a certain threshold, + * the range will be recursively divided into two and assign two threads + * to sort each half range. + * Ref: https://github.com/dmlc/difacto/blob/master/src/common/parallel_sort.h + */ +template +void ParallelSort(RandomIt first, RandomIt last, size_t num_threads) { + ParallelSort(first, last, num_threads, + std::less::value_type>()); +} + /*! 
* \brief Random Engine */ @@ -159,8 +312,6 @@ FCompType GetFCompute(const nnvm::Op* op, const std::string& name, } } -#endif // DMLC_USE_CXX11 - } // namespace common } // namespace mxnet #endif // MXNET_COMMON_UTILS_H_ diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index 47b74758d702..fe8cc653bbc3 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "../common/utils.h" #include "./exec_pass.h" @@ -40,33 +41,98 @@ const OperatorProperty* OpPropGetOpProperty(const NodeAttrs& attrs); namespace exec { -// forward executor -class StatefulComputeExecutor : public OpExecutor { +// abstract OpExecutor which provides storage fallback procedure on +// non-default inputs and outputs +// FComputeExecutor and FStatefulComputeExecutor inherit from this class +class StorageFallbackOpExecutor : public OpExecutor { public: - void Run(RunContext rctx) override { + explicit StorageFallbackOpExecutor(const std::vector &mutate_idx) + : mutate_idx_(mutate_idx) {} + + void Setup() override { + init_ = false; + } + + protected: + // initialize the data blobs + void InitBlobs() { + using namespace common; if (!init_) { - in_data_.clear(); - for (size_t i = 0; i < in_array.size(); ++i) { - in_data_.push_back(in_array[i].data()); - } - out_data_.clear(); - for (size_t i = 0; i < out_array.size(); ++i) { - out_data_.push_back(out_array[i].data()); + in_data_.clear(); out_data_.clear(); + pre_temp_src_.clear(); pre_temp_dst_.clear(); + post_temp_src_.clear(); post_temp_dst_.clear(); + in_temp_idx_map_.clear(); + SetupDefaultBlobs(in_array, &in_data_, &pre_temp_src_, &pre_temp_dst_, &in_temp_idx_map_); + SetupDefaultBlobs(out_array, &out_data_, &post_temp_dst_, &post_temp_src_); + for (const auto idx : mutate_idx_) { + auto map_iter = in_temp_idx_map_.find(idx); + if (map_iter != in_temp_idx_map_.end()) { + 
post_temp_src_.push_back(pre_temp_dst_[map_iter->second]); + post_temp_dst_.push_back(in_array[idx]); + } } init_ = true; } + } + + // storage fallback before fcompute is launched + void PreFCompute(bool is_gpu) { + using namespace common; + InitBlobs(); + if (is_gpu) { +#if MXNET_USE_CUDA + CastNonDefaultStorage(pre_temp_src_, pre_temp_dst_, op_ctx); +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif + } else { + CastNonDefaultStorage(pre_temp_src_, pre_temp_dst_, op_ctx); + } + } + + // storage fallback after fcompute is completed + void PostFCompute(bool is_gpu) { + using namespace common; + if (is_gpu) { +#if MXNET_USE_CUDA + CastNonDefaultStorage(post_temp_src_, post_temp_dst_, op_ctx); +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif + } else { + CastNonDefaultStorage(post_temp_src_, post_temp_dst_, op_ctx); + } + } + + // default storage tensor blobs for fcompute + std::vector in_data_, out_data_; + // source NDArray for cast storage + std::vector pre_temp_src_, post_temp_src_; + // destination NDArray for cast storage + std::vector pre_temp_dst_, post_temp_dst_; + // mapping from index in input_blobs to index in pre_temp_dst + std::unordered_map in_temp_idx_map_; + // indices of mutatable inputs + std::vector mutate_idx_; + // whether blobs are initialized + bool init_; +}; + + +// stateful compute executor +class StatefulComputeExecutor : public StorageFallbackOpExecutor { + public: + void Run(RunContext rctx, bool is_gpu) override { op_ctx.run_ctx = rctx; + PreFCompute(is_gpu); fcompute_(state_, op_ctx, in_data_, req, out_data_); + PostFCompute(is_gpu); #if MKL_EXPERIMENTAL == 1 mkl_tblobs_prv_to_cpu(in_data_); mkl_tblobs_prv_to_cpu(out_data_); #endif } - void Setup() override { - init_ = false; - } - ExecType exec_type() const override { return exec_type_; } @@ -77,23 +143,23 @@ class StatefulComputeExecutor : public OpExecutor { explicit StatefulComputeExecutor(const OpStatePtr& state, const FStatefulCompute& fcompute, - ExecType 
exec_type) - : state_(state), fcompute_(fcompute), exec_type_(exec_type) {} + ExecType exec_type, + const std::vector &mutate_idx) + : StorageFallbackOpExecutor(mutate_idx), + state_(state), fcompute_(fcompute), exec_type_(exec_type) {} private: friend Graph AttachOpExecs(Graph g); OpStatePtr state_; FStatefulCompute fcompute_; ExecType exec_type_; - bool init_; - std::vector in_data_, out_data_; }; -// forward executor +// stateful compute_ex executor class StatefulComputeExExecutor : public OpExecutor { public: - void Run(RunContext rctx) override { + void Run(RunContext rctx, bool is_gpu) override { op_ctx.run_ctx = rctx; fcompute_(state_, op_ctx, in_array, req, out_array); } @@ -121,47 +187,60 @@ class StatefulComputeExExecutor : public OpExecutor { }; -// fcompute executor executor -class FComputeExecutor : public OpExecutor { +// fcompute executor +class FComputeExecutor : public StorageFallbackOpExecutor { public: - void Run(RunContext rctx) override { - if (!init_) { - in_data_.resize(in_array.size()); - out_data_.resize(out_array.size()); - auto get_blob = [](const NDArray& nd) { - return nd.data(); - }; - std::transform(in_array.begin(), in_array.end(), in_data_.begin(), get_blob); - std::transform(out_array.begin(), out_array.end(), out_data_.begin(), get_blob); - init_ = true; - } + void Run(RunContext rctx, bool is_gpu) override { + using namespace common; op_ctx.run_ctx = rctx; + PreFCompute(is_gpu); fcompute_(attrs_, op_ctx, in_data_, req, out_data_); + PostFCompute(is_gpu); #if MKL_EXPERIMENTAL == 1 mkl_tblobs_prv_to_cpu(in_data_); mkl_tblobs_prv_to_cpu(out_data_); #endif } - void Setup() override { - init_ = false; + ExecType exec_type() const override { + return exec_type_; } + explicit FComputeExecutor(const NodeAttrs& attrs, FCompute fcompute, + ExecType exec_type, const std::vector &mutate_idx) + : StorageFallbackOpExecutor(mutate_idx), + attrs_(attrs), fcompute_(fcompute), exec_type_(exec_type) { + } + + private: + NodeAttrs attrs_; + FCompute 
fcompute_; + ExecType exec_type_; +}; + +// fcompute_ex executor +class FComputeExExecutor : public OpExecutor { + public: + void Run(RunContext rctx, bool is_gpu) override { + op_ctx.run_ctx = rctx; + fcompute_(attrs_, op_ctx, in_array, req, out_array); + } + + void Setup() override {} + ExecType exec_type() const override { return exec_type_; } - explicit FComputeExecutor(const NodeAttrs& attrs, FCompute fcompute, - ExecType exec_type) + explicit FComputeExExecutor(const NodeAttrs& attrs, FComputeEx fcompute, + ExecType exec_type) : attrs_(attrs), fcompute_(fcompute), exec_type_(exec_type) { } private: NodeAttrs attrs_; - FCompute fcompute_; + FComputeEx fcompute_; ExecType exec_type_; - bool init_; - std::vector in_data_, out_data_; }; // pass to attach operator executors @@ -180,6 +259,8 @@ Graph AttachOpExecs(Graph g) { const auto& vctx = g.GetAttr("context"); const auto& saved_states = g.GetAttr< std::unordered_map >("saved_states"); + const auto& dispatch_stypes = g.GetAttr("dispatch_stypes"); + // get the graph const auto& idx = g.indexed_graph(); @@ -217,7 +298,8 @@ Graph AttachOpExecs(Graph g) { FStatefulCompute fcompute = common::GetFCompute( op, "FStatefulCompute", vctx[i]); if (fcompute != nullptr) { - ret[i] = std::make_shared(state, fcompute, exec_type); + ret[i] = std::make_shared(state, fcompute, + exec_type, mutate_index); } else { FStatefulComputeEx fcompute_ex = common::GetFCompute( op, "FStatefulComputeEx", vctx[i]); @@ -236,7 +318,7 @@ Graph AttachOpExecs(Graph g) { if (fcompute != nullptr) { ret[i] = std::make_shared( dynamic_cast(ret[fwd_id].get())->state_, - fcompute, exec_type); + fcompute, exec_type, mutate_index); } else { FStatefulComputeEx fcompute_ex = common::GetFCompute( op, "FStatefulComputeEx", vctx[i]); @@ -249,11 +331,15 @@ Graph AttachOpExecs(Graph g) { } } else { FCompute fcompute = common::GetFCompute(op, "FCompute", vctx[i]); - if (fcompute != nullptr) { + FComputeEx fcomp_ex = common::GetFCompute(op, "FComputeEx", vctx[i]); 
+ if (fcomp_ex != nullptr && dispatch_stypes[i] != kDefaultStorage) { + ret[i] = std::make_shared( + inode.source->attrs, fcomp_ex, exec_type); + } else if (fcompute != nullptr) { ret[i] = std::make_shared( - inode.source->attrs, fcompute, exec_type); + inode.source->attrs, fcompute, exec_type, mutate_index); } else { - LOG(FATAL) << "FCompute not registered " << op->name; + LOG(INFO) << "Neither FCompute nor FComputeEx registered " << op->name; } } } diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index 0eda71d98214..326262147b9f 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -27,9 +27,12 @@ #include #include #include +#include #include +#include #include #include +#include namespace mxnet { namespace exec { @@ -37,6 +40,12 @@ namespace exec { /*! \brief reuse graph definition */ using nnvm::Graph; +const int kBadStorageID = -1; +const int kExternalStorageID = -2; +const int kDynamicStorageID = -3; + +const int kNonDefaultStorage = -2; + /*! * \brief executor to execute an operator * This is a graph executor dependent interface @@ -44,7 +53,7 @@ using nnvm::Graph; */ class OpExecutor { public: - /*! \brief input arrays */ + /*! \brief input data arrays, which may be either input or aux */ std::vector in_array; /*! \brief output data arrays */ std::vector out_array; @@ -65,7 +74,7 @@ class OpExecutor { * This function call do not synchronize the stream. * \param rctx The runtime context passed in by environment. */ - virtual void Run(RunContext rctx) = 0; + virtual void Run(RunContext rctx, bool is_gpu) = 0; /*! \return the execution type */ virtual ExecType exec_type() const = 0; /*! \return return engine variable for operator states */ @@ -123,6 +132,45 @@ Graph AttachOpResources(Graph g); */ Graph DetectInplaceAddTo(Graph g); +/*! + * \brief Infer shapes in the graph given the information. + * \param graph The input graph. + * \param shape_inputs The shapes of input symbols to the graph. 
+ * \param shape_attr_key The key to the node attribute that can indicate shape. This is + * the place where manual hint for shapes could be injected. + * \return A graph with new attribute "shape" containing inferred shape of each NodeEntry. + * The index of ShapeVector is given by graph.indexed_graph().entry_id. + */ +Graph InferShape(Graph graph, + nnvm::ShapeVector shape_inputs, + const std::string& shape_attr_key = ""); + +/*! + * \brief Infer types in the graph given the information. + * \param graph The input graph. + * \param dtype_inputs The types of input symbols to the graph. + * \param dtype_attr_key The key to the node attribute that can indicate types. This is + * the place where manual hint for types could be injected. + * \return A graph with new attribute "dtype" containing inferred type of each NodeEntry. + * The index of DTypeVector is given by graph.indexed_graph().entry_id. + */ +Graph InferType(Graph graph, + nnvm::DTypeVector dtype_inputs, + const std::string& dtype_attr_key = ""); + +/*! + * \brief Infer storage types in the graph given the information. + * \param graph The input graph. + * \param storage_type_inputs The storage types of input symbols to the graph. + * \param storage_type_attr_key The key to the node attribute that can indicate storage types. + This is the place where manual hint for types could be injected. + * \return A graph with new attribute "storage_type" containing inferred type of each NodeEntry. + * The index of StorageTypeVector is given by graph.indexed_graph().entry_id. 
+ */ +Graph InferStorageType(Graph graph, + StorageTypeVector storage_type_inputs, + const std::string& storage_type_attr_key = ""); + } // namespace exec } // namespace mxnet diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 6dc8cf39970e..9c4398343b1c 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -30,9 +30,15 @@ #include "./exec_pass.h" #include "./graph_executor.h" #include "../engine/profiler.h" +#include "../common/utils.h" namespace mxnet { namespace exec { + +GraphExecutor::GraphExecutor() { + log_verbose_ = dmlc::GetEnv("MXNET_EXEC_VERBOSE_LOGGING", false); +} + GraphExecutor::~GraphExecutor() { for (auto& n : op_nodes_) { if (n.cached_opr != nullptr) { @@ -47,6 +53,30 @@ GraphExecutor::~GraphExecutor() { } } +inline NDArray InitZeros(const NDArrayStorageType stype, const TShape &shape, + const Context &ctx, const int dtype) { + // NDArray with default storage + if (stype == kDefaultStorage) { + NDArray ret(shape, ctx, false, dtype); + ret = 0; + return ret; + } + // NDArray with non-default storage. Storage allocation is always delayed. + return NDArray(stype, shape, ctx, true, dtype); +} + +inline void EmplaceBackZeros(const NDArrayStorageType stype, const TShape &shape, + const Context &ctx, const int dtype, + std::vector *vec) { + // NDArray with default storage + if (stype == kDefaultStorage) { + vec->emplace_back(shape, ctx, false, dtype); + vec->back() = 0; + } else { + // NDArray with non-default storage. Storage allocation is always delayed. 
+ vec->emplace_back(stype, shape, ctx, true, dtype); + } +} void GraphExecutor::Forward(bool is_train) { RunOps(is_train, 0, num_forward_nodes_); } @@ -438,6 +468,29 @@ void HandleInferTypeError(const size_t num_forward_inputs, << oss.str(); } +void HandleInferStorageTypeError(const size_t num_forward_inputs, + const nnvm::IndexedGraph& idx, + const StorageTypeVector& inferred_stypes) { + int cnt = 10; + std::ostringstream oss; + for (size_t i = 0; i < num_forward_inputs; ++i) { + const uint32_t nid = idx.input_nodes().at(i); + const uint32_t eid = idx.entry_id(nid, 0); + const int inferred_stype = inferred_stypes[eid]; + if (inferred_stype == -1) { + const std::string& arg_name = idx[nid].source->attrs.name; + oss << arg_name << ": " << inferred_stype << ", "; + if (--cnt == 0) { + oss << "..."; + break; + } + } + } + LOG(FATAL) << "InferStoragetType pass cannot decide storage type for the following arguments " + "(-1 means unknown stype). Please consider providing them as inputs:\n" + << oss.str(); +} + /*! * \brief GraphExecutor initializer for regular bind flow in which * input arguments and gradients are provided by users. 
This initializer @@ -475,21 +528,25 @@ void GraphExecutor::Init(nnvm::Symbol symbol, data_entry_.resize(idx.num_node_entries()); nnvm::ShapeVector arg_shapes; nnvm::DTypeVector arg_dtypes; + StorageTypeVector arg_stypes; for (size_t i = 0; i < num_forward_inputs_; ++i) { const uint32_t nid = idx.input_nodes().at(i); const std::string& arg_name = idx[nid].source->attrs.name; + size_t eid = idx.entry_id(nid, 0); if (mutable_nodes.count(nid)) { CHECK_LT(aux_top, aux_states.size()); - data_entry_[idx.entry_id(nid, 0)] = aux_states[aux_top]; + data_entry_[eid] = aux_states[aux_top]; arg_shapes.push_back(aux_states[aux_top].shape()); arg_dtypes.push_back(aux_states[aux_top].dtype()); + arg_stypes.push_back(aux_states[aux_top].storage_type()); aux_state_map_.emplace(arg_name, aux_states[aux_top]); ++aux_top; } else { CHECK_LT(arg_top, in_args.size()); - data_entry_[idx.entry_id(nid, 0)] = in_args[arg_top]; + data_entry_[eid] = in_args[arg_top]; arg_shapes.push_back(in_args[arg_top].shape()); arg_dtypes.push_back(in_args[arg_top].dtype()); + arg_stypes.push_back(in_args[arg_top].storage_type()); in_arg_map_.emplace(arg_name, in_args[arg_top]); if (kNullOp != grad_req_types[arg_top]) { grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_store[arg_top]); @@ -497,23 +554,33 @@ void GraphExecutor::Init(nnvm::Symbol symbol, } ++arg_top; } + if (log_verbose_) { + LOG(INFO) << "\tassign data entry\t" << eid << " as stype " + << data_entry_[eid].storage_type() << " (input)"; + } } // expand arg_shapes and arg_dtypes to contain backward inputs arg_shapes.resize(idx.input_nodes().size(), TShape()); - g = nnvm::pass::InferShape(g, arg_shapes, "__shape__"); + g = InferShape(std::move(g), arg_shapes, "__shape__"); if (g.GetAttr("shape_num_unknown_nodes") != 0U) { HandleInferShapeError(num_forward_inputs_, g.indexed_graph(), g.GetAttr("shape")); } arg_dtypes.resize(idx.input_nodes().size(), -1); - g = nnvm::pass::InferType(g, arg_dtypes, "__dtype__"); + g = 
InferType(std::move(g), arg_dtypes, "__dtype__"); if (g.GetAttr("dtype_num_unknown_nodes") != 0U) { HandleInferTypeError(num_forward_inputs_, g.indexed_graph(), g.GetAttr("dtype")); } + g = InferStorageType(std::move(g), arg_stypes, "__storage_type__"); + if (g.GetAttr("storage_type_num_unknown_nodes") != 0U) { + HandleInferStorageTypeError(num_forward_inputs_, g.indexed_graph(), + g.GetAttr("storage_type")); + } + // Initialize the rest attributes of the graph. // This function can be called by regular bind // operation flow as well. @@ -529,6 +596,7 @@ void GraphExecutor::Init(nnvm::Symbol symbol, void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, const nnvm::ShapeVector& inferred_shapes, const nnvm::DTypeVector& inferred_dtypes, + const StorageTypeVector& inferred_stypes, const std::vector& in_arg_ctxes, const std::vector& arg_grad_ctxes, const std::vector& aux_state_ctxes, @@ -546,22 +614,37 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, const uint32_t eid = idx.entry_id(nid, 0); const TShape& inferred_shape = inferred_shapes[eid]; const int inferred_dtype = inferred_dtypes[eid]; + const NDArrayStorageType inferred_stype = (NDArrayStorageType) inferred_stypes[eid]; const std::string& arg_name = idx[nid].source->attrs.name; if (mutable_nodes.count(nid)) { // aux_states - aux_state_vec->emplace_back(inferred_shape, aux_state_ctxes[aux_top], false, inferred_dtype); - aux_state_vec->back() = 0; + EmplaceBackZeros(inferred_stype, inferred_shape, aux_state_ctxes[aux_top], + inferred_dtype, aux_state_vec); data_entry_[eid] = aux_state_vec->back(); aux_state_map_.emplace(arg_name, aux_state_vec->back()); ++aux_top; + if (log_verbose_) { + LOG(INFO) << "\tassign aux entry\t" << eid << "\t as stype " << inferred_stype; + } } else { // in_args - in_arg_vec->emplace_back(inferred_shape, in_arg_ctxes[arg_top], false, inferred_dtype); - in_arg_vec->back() = 0; + EmplaceBackZeros(inferred_stype, inferred_shape, in_arg_ctxes[arg_top], + 
inferred_dtype, in_arg_vec); data_entry_[eid] = in_arg_vec->back(); + if (log_verbose_) { + LOG(INFO) << "\tassign data entry\t" << eid << "\tas stype " << inferred_stype; + } + // Get the storage type for grad if (kNullOp == grad_req_types[arg_top]) { arg_grad_vec->emplace_back(); } else { - arg_grad_vec->emplace_back(inferred_shape, arg_grad_ctxes[arg_top], false, inferred_dtype); - arg_grad_vec->back() = 0; + // Init based on storage type + auto grad_oid = grad_store_.size() + num_forward_outputs_; + auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); + auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; + EmplaceBackZeros(grad_stype, inferred_shape, arg_grad_ctxes[arg_top], + inferred_dtype, arg_grad_vec); + if (log_verbose_) { + LOG(INFO) << "\tassign grad entry\t" << grad_eid << "\tas stype " << grad_stype; + } grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); arg_grad_map_.emplace(arg_name, arg_grad_vec->back()); } @@ -573,33 +656,40 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, /*! * \brief If the requested ndarray's shape size is less than - * the corresponding shared_data_array's shape size, reuse - * the memory allocation; otherwise, create a zero ndarray. + * the corresponding shared_data_array's shape size and the + * storage type is default storage, reuse the memory allocation + * in shared_buffer; otherwise, create a zero ndarray. 
*/ NDArray ReshapeOrCreate(const std::string& name, const TShape& dest_arg_shape, const int dest_arg_dtype, + const NDArrayStorageType dest_arg_stype, const Context& ctx, std::unordered_map* shared_buffer) { + if (dest_arg_dtype != kDefaultStorage) { + return InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype); + } auto it = shared_buffer->find(name); if (it != shared_buffer->end()) { if (it->second.shape().Size() >= dest_arg_shape.Size()) { // memory can be reused CHECK_EQ(it->second.dtype(), dest_arg_dtype) << "Requested arg array's dtype does not match the reusable ndarray"; + CHECK_EQ(it->second.storage_type(), kDefaultStorage) + << "shared_buffer should only contain NDArrays with default storage type."; return it->second.Reshape(dest_arg_shape); } else { LOG(WARNING) << "Bucketing: data " << name << " has a shape " << dest_arg_shape << ", which is larger than already allocated shape " << it->second.shape() << ". Need to re-allocate. Consider putting default bucket key to be " << "the bucket taking the largest input for better memory sharing."; - it->second = NDArray(dest_arg_shape, ctx, false, dest_arg_dtype); - it->second = 0; + // the NDArrays in shared_buffer are guaranteed to be of default storage + it->second = InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype); return it->second; } // arg_array.shape().Size() >= arg_shape.Size() } else { - auto p = shared_buffer->emplace(name, NDArray(dest_arg_shape, ctx, false, dest_arg_dtype)); - p.first->second = 0; - return p.first->second; + auto ret = InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype); + shared_buffer->emplace(name, ret); + return ret; } // if (it != shared_buffer->end()) } @@ -612,6 +702,7 @@ NDArray ReshapeOrCreate(const std::string& name, void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, const nnvm::ShapeVector& inferred_shapes, const nnvm::DTypeVector& inferred_dtypes, + const StorageTypeVector& inferred_stypes, const std::vector& in_arg_ctxes, 
const std::vector& arg_grad_ctxes, const std::vector& aux_state_ctxes, @@ -631,9 +722,12 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, const uint32_t eid = idx.entry_id(nid, 0); const TShape& inferred_shape = inferred_shapes[eid]; const int inferred_dtype = inferred_dtypes[eid]; + const NDArrayStorageType inferred_stype = (NDArrayStorageType) inferred_stypes[eid]; const std::string& arg_name = idx[nid].source->attrs.name; - if (mutable_nodes.count(nid)) { // aux_states - if (nullptr != shared_exec) { + // aux_states + if (mutable_nodes.count(nid)) { + if (nullptr != shared_exec && inferred_stype == kDefaultStorage && + shared_exec->aux_state_map().at(arg_name).storage_type() == kDefaultStorage) { const NDArray& aux_nd = shared_exec->aux_state_map().at(arg_name); CHECK_EQ(inferred_shape, aux_nd.shape()) << "Inferred shape does not match shared_exec.aux_array's shape." @@ -647,16 +741,18 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, << arg_name << " for the current executor"; aux_state_vec->emplace_back(aux_nd); } else { - aux_state_vec->emplace_back(inferred_shape, aux_state_ctxes[aux_top], - false, inferred_dtype); - aux_state_vec->back() = 0; + EmplaceBackZeros(inferred_stype, inferred_shape, aux_state_ctxes[aux_top], + inferred_dtype, aux_state_vec); } // if (has_shared_exec) data_entry_[eid] = aux_state_vec->back(); aux_state_map_.emplace(arg_name, aux_state_vec->back()); ++aux_top; - } else { // in_args + } else { // in_args and grad for in_args if (shared_arg_names.count(arg_name)) { // model parameter - if (nullptr != shared_exec) { + // model parameter + if (nullptr != shared_exec && inferred_stype == kDefaultStorage && + shared_exec->in_arg_map().at(arg_name).storage_type() == kDefaultStorage) { + // try to reuse memory from shared_exec const NDArray& in_arg_nd = shared_exec->in_arg_map().at(arg_name); CHECK_EQ(inferred_shape, in_arg_nd.shape()) << "Inferred shape does not match shared_exec.arg_array's shape" @@ 
-669,33 +765,43 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, " be resued for creating NDArray of the argument" << arg_name << " for the current executor"; in_arg_vec->emplace_back(in_arg_nd); - if (kNullOp == grad_req_types[arg_top]) { - arg_grad_vec->emplace_back(); - } else { + } else { + // doesn't have shared_exec, or non-default storage + EmplaceBackZeros(inferred_stype, inferred_shape, in_arg_ctxes[arg_top], + inferred_dtype, in_arg_vec); + } + // gradient for model parameter + if (kNullOp == grad_req_types[arg_top]) { + arg_grad_vec->emplace_back(); + } else { + auto grad_oid = grad_store_.size() + num_forward_outputs_; + auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); + auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; + if (nullptr != shared_exec && grad_stype == kDefaultStorage && + shared_exec->arg_grad_map().at(arg_name).storage_type() == kDefaultStorage) { + // try to reuse memory from shared_exec arg_grad_vec->emplace_back(shared_exec->arg_grad_map().at(arg_name)); - grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); - } // if (kNullOp == grad_req_types[arg_top]) - } else { // !has shared_exec - in_arg_vec->emplace_back(inferred_shape, in_arg_ctxes[arg_top], false, inferred_dtype); - in_arg_vec->back() = 0; - if (kNullOp == grad_req_types[arg_top]) { - arg_grad_vec->emplace_back(); } else { - arg_grad_vec->emplace_back(inferred_shape, arg_grad_ctxes[arg_top], - false, inferred_dtype); - arg_grad_vec->back() = 0; - grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); - } // if (kNullOp == grad_req_types[arg_top]) - } // if (has_shared_exec) + EmplaceBackZeros(grad_stype, inferred_shape, arg_grad_ctxes[arg_top], + inferred_dtype, arg_grad_vec); + } + grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); + } } else { // !shared_arg_names.count(arg_name) + // model parameter in_arg_vec->emplace_back(ReshapeOrCreate(arg_name, inferred_shape, 
inferred_dtype, - in_arg_ctxes[arg_top], shared_buffer)); + inferred_stype, in_arg_ctxes[arg_top], + shared_buffer)); + // gradient for model parameter if (kNullOp == grad_req_types[arg_top]) { arg_grad_vec->emplace_back(); } else { + auto grad_oid = grad_store_.size() + num_forward_outputs_; + auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); + auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; arg_grad_vec->emplace_back(ReshapeOrCreate("grad of " + arg_name, inferred_shape, - inferred_dtype, arg_grad_ctxes[arg_top], - shared_buffer)); + inferred_dtype, grad_stype, + arg_grad_ctxes[arg_top], shared_buffer)); grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); } // if (kNullOp == grad_req_types[arg_top]) } // if (shared_arg_names.count(arg_name)) @@ -718,14 +824,35 @@ void GraphExecutor::FinishInitGraph(nnvm::Symbol symbol, Executor* shared_exec, const nnvm::NodeEntryMap& feed_dict) { const auto& idx = g.indexed_graph(); + // dispatch based on stype per operator + const auto& vstorage_type = g.GetAttr("storage_type"); + StorageTypeVector dispatch_stypes(idx.num_nodes(), kUndefinedStorage); + for (size_t nid = 0; nid < idx.num_nodes(); nid++) { + const auto& inode = idx[nid]; + auto num_outputs = inode.source->num_outputs(); + auto num_inputs = inode.inputs.size(); + StorageTypeVector vs(num_inputs + num_outputs, kUndefinedStorage); + for (size_t i = 0; i < num_inputs; i++) { + auto e = inode.inputs[i]; + vs[i] = vstorage_type[idx.entry_id(e)]; + CHECK_NE(vs[i], kUndefinedStorage); + } + for (uint32_t i = 0; i < num_outputs; ++i) { + uint32_t eid = idx.entry_id(nid, i); + vs[i + num_inputs] = vstorage_type[eid]; + } + bool contains_non_default = common::ContainsNonDefaultStorage(vs); + dispatch_stypes[nid] = contains_non_default ? 
kNonDefaultStorage : kDefaultStorage; + } + g.attrs["dispatch_stypes"] = std::make_shared(std::move(dispatch_stypes)); + + // data entries for output gradients for (size_t j = num_forward_outputs_; j < idx.outputs().size(); ++j) { data_entry_[idx.entry_id(idx.outputs()[j])] = grad_store_[j - num_forward_outputs_].second; } { // memory allocator - const int kBadStorageID = -1; - const int kExternalStorageID = -2; nnvm::StorageVector arg_storage_id(idx.num_node_entries(), kBadStorageID); for (size_t j = num_forward_outputs_; j < idx.outputs().size(); ++j) { arg_storage_id[idx.entry_id(idx.outputs()[j])] = kExternalStorageID; @@ -735,6 +862,9 @@ void GraphExecutor::FinishInitGraph(nnvm::Symbol symbol, data_entry_[eid] = kv.second; arg_storage_id[eid] = kExternalStorageID; } + for (size_t i = 0; i < idx.num_node_entries(); i++) { + if (vstorage_type[i] != kDefaultStorage) arg_storage_id[i] = kDynamicStorageID; + } g.attrs["storage"] = std::make_shared(std::move(arg_storage_id)); g = nnvm::ApplyPass(g, "PlanMemory"); } @@ -792,6 +922,7 @@ void GraphExecutor::Init(nnvm::Symbol symbol, const std::vector& aux_state_ctxes, const std::unordered_map& arg_shape_map, const std::unordered_map& arg_dtype_map, + const std::unordered_map& arg_stype_map, const std::vector& grad_req_types, const std::unordered_set& shared_arg_names, std::vector* in_arg_vec, @@ -811,6 +942,7 @@ void GraphExecutor::Init(nnvm::Symbol symbol, const nnvm::IndexedGraph& idx = g.indexed_graph(); nnvm::ShapeVector arg_shapes(idx.input_nodes().size(), TShape()); nnvm::DTypeVector arg_dtypes(idx.input_nodes().size(), -1); + StorageTypeVector arg_stypes(idx.input_nodes().size(), kUndefinedStorage); for (size_t i = 0; i < num_forward_inputs_; ++i) { const uint32_t nid = idx.input_nodes().at(i); const std::string& name = idx[nid].source->attrs.name; @@ -822,29 +954,41 @@ void GraphExecutor::Init(nnvm::Symbol symbol, if (arg_dtype_map.end() != it2) { arg_dtypes[i] = it2->second; } + auto it3 = 
arg_stype_map.find(name); + if (arg_stype_map.end() != it3) { + arg_stypes[i] = it3->second; + } } - g = nnvm::pass::InferShape(g, arg_shapes, "__shape__"); + g = InferShape(std::move(g), arg_shapes, "__shape__"); if (g.GetAttr("shape_num_unknown_nodes") != 0U) { HandleInferShapeError(num_forward_inputs_, g.indexed_graph(), g.GetAttr("shape")); } - g = nnvm::pass::InferType(g, arg_dtypes, "__dtype__"); + g = InferType(std::move(g), arg_dtypes, "__dtype__"); if (g.GetAttr("dtype_num_unknown_nodes") != 0U) { HandleInferTypeError(num_forward_inputs_, g.indexed_graph(), g.GetAttr("dtype")); } + g = InferStorageType(std::move(g), arg_stypes, "__storage_type__"); + if (g.GetAttr("storage_type_num_unknown_nodes") != 0U) { + HandleInferStorageTypeError(num_forward_inputs_, g.indexed_graph(), + g.GetAttr("storage_type")); + } + // Create in_args, arg_grads, and aux_states using // the inferred shapes and dtypes. if (nullptr == shared_buffer) { // regular simple bind InitArguments(idx, g.GetAttr("shape"), g.GetAttr("dtype"), + g.GetAttr("storage_type"), in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes, grad_req_types, in_arg_vec, arg_grad_vec, aux_state_vec); } else { // simple bind using shared data arrays and shared_exec InitArguments(idx, g.GetAttr("shape"), g.GetAttr("dtype"), + g.GetAttr("storage_type"), in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes, grad_req_types, shared_arg_names, shared_exec, shared_buffer, in_arg_vec, arg_grad_vec, aux_state_vec); @@ -905,20 +1049,29 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { const auto& vdtype = graph_.GetAttr("dtype"); const auto& vshape = graph_.GetAttr("shape"); const auto& vstorage = graph_.GetAttr("storage_id"); + const auto& vstorage_type = graph_.GetAttr("storage_type"); const auto& vctx = graph_.GetAttr("context"); CHECK_EQ(idx.num_node_entries(), vshape.size()); CHECK_EQ(idx.num_node_entries(), vdtype.size()); CHECK_EQ(idx.num_node_entries(), vstorage.size()); CHECK_EQ(data_entry_.size(), 
vshape.size()); std::vector data_context(idx.num_node_entries()); + std::vector data_storage_type(idx.num_node_entries(), kUndefinedStorage); for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { for (uint32_t i = 0; i < idx[nid].source->num_outputs(); ++i) { - data_context[idx.entry_id(nid, i)] = vctx[nid]; + auto eid = idx.entry_id(nid, i); + data_context[eid] = vctx[nid]; + CHECK_NE(vstorage_type[eid], kUndefinedStorage); // storage_type is indexed by entry id + data_storage_type[eid] = (NDArrayStorageType) vstorage_type[eid]; } } // information about the pool - using PoolEntry = std::pair; + struct PoolEntry { + Context ctx; + size_t bytes; + NDArrayStorageType stype; + }; std::vector pool_info; // assign array to head gradient @@ -926,26 +1079,36 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { uint32_t nid = idx.input_nodes().at(i); uint32_t oid = head_grad_map_.at(idx[nid].source); uint32_t eid = idx.entry_id(idx.outputs()[oid]); + NDArrayStorageType stype = (NDArrayStorageType) vstorage_type[eid]; CHECK_NE(vshape[eid].ndim(), 0U); CHECK_NE(vdtype[eid], -1); - data_entry_[idx.entry_id(nid, 0)] = - NDArray(vshape[eid], data_context[eid], false, vdtype[eid]); + auto data_eid = idx.entry_id(nid, 0); + // initialize based on storage_type + if (stype != kDefaultStorage) { + data_entry_[data_eid] = NDArray(stype, vshape[eid], data_context[eid], true, vdtype[eid]); + } else { + data_entry_[data_eid] = NDArray(vshape[eid], data_context[eid], false, vdtype[eid]); + } + if (log_verbose_) { + LOG(INFO) << "\tinit head_g entry\t" << data_eid << "\tas stype " << stype; + } } // get maximum bytes in each pool for (size_t i = 0; i < vshape.size(); ++i) { if (!data_entry_[i].is_none()) continue; size_t bytes = vshape[i].Size() * mshadow::mshadow_sizeof(vdtype[i]); int storage_id = vstorage[i]; + // skip pool allocation for kBadStorageID, kExternalStorageID and kDynamicStorageID if (storage_id < 0) continue; size_t sid = static_cast(storage_id); if (sid >= pool_info.size()) { -
pool_info.resize(sid + 1, PoolEntry{Context::CPU(), size_t(0)}); + pool_info.resize(sid + 1, PoolEntry{Context::CPU(), size_t(0), kUndefinedStorage}); } PoolEntry& info = pool_info[sid]; - if (info.second == 0) { - info = PoolEntry{data_context[i], bytes}; + if (info.bytes == 0) { + info = PoolEntry{data_context[i], bytes, data_storage_type[i]}; } else { - info.second = std::max(info.second, bytes); + info.bytes = std::max(info.bytes, bytes); } } // construct the re-use pool, if needed @@ -966,13 +1129,14 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { sorted_pool_index.push_back(i); } auto pool_comparator = [&pool_info](int lhs, int rhs){ - return pool_info[lhs].second > pool_info[rhs].second; + return pool_info[lhs].bytes > pool_info[rhs].bytes; }; std::sort(sorted_pool_index.begin(), sorted_pool_index.end(), pool_comparator); for (size_t i : sorted_pool_index) { - const Context& ctx = pool_info[i].first; - size_t bytes = pool_info[i].second; + const Context& ctx = pool_info[i].ctx; + size_t bytes = pool_info[i].bytes; + NDArrayStorageType storage_type = pool_info[i].stype; bool allocated = false; for (auto it = free_pool.lower_bound(bytes); it != free_pool.end(); ++it) { if (it->second.ctx() == ctx && it->first >= bytes) { @@ -987,7 +1151,9 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { CHECK_LE(nword, std::numeric_limits::max()); // allocate float arrays TShape shape{static_cast(nword)}; - NDArray nd(shape, ctx); + // TODO(junwu): adding delay_alloc=true to create nd + // is a temporary solution. 
+ NDArray nd(shape, ctx, true); data_pool_[i] = nd; // put the new allocated arrays to shared pool if (shared_pool != nullptr) { @@ -997,15 +1163,22 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { } CHECK_EQ(data_pool_.size(), pool_info.size()); // assign the data entries - for (size_t i = 0; i < data_entry_.size(); ++i) { // avoid pre-allocated arrays if (!data_entry_[i].is_none()) continue; // assign allocated array by storage id int storage_id = vstorage[i]; - CHECK_GE(storage_id, 0) << "Do not support runtime shape op yet"; - const NDArray& src = data_pool_.at(storage_id); - data_entry_[i] = src.AsArray(vshape[i], vdtype[i]); + auto storage_type = (NDArrayStorageType) vstorage_type[i]; + if (storage_type == kDefaultStorage) { + CHECK_GE(storage_id, 0) << "Do not support runtime shape op yet"; + const NDArray& src = data_pool_.at(storage_id); + data_entry_[i] = src.AsArray(vshape[i], vdtype[i]); + } else { + data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i]); + } + if (log_verbose_) { + LOG(INFO) << "\tinit data entry\t" << i << "\tas stype " << storage_type; + } } } @@ -1020,11 +1193,28 @@ void GraphExecutor::InitCachedOps() { const auto& vctx = graph_.GetAttr("context"); const auto& addto_entry = graph_.GetAttr >("addto_entry"); const auto& skip_plus_node = graph_.GetAttr >("skip_plus_node"); + const auto& vstorage_type = graph_.GetAttr("storage_type"); op_nodes_.resize(idx.num_nodes()); // setup the array and requirements. 
for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { const auto& inode = idx[nid]; + if (log_verbose_) { + if (inode.source->is_variable()) { + LOG(INFO) << "node " << nid << " var"; + } else { + LOG(INFO) << "node " << nid << " " << inode.source->attrs.op->name; + auto exec = op_execs[nid]; + for (const auto& e : inode.inputs) { + auto eid = idx.entry_id(e); + LOG(INFO) << "\t\tinput " << eid << " stype: " << vstorage_type[eid]; + } + for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) { + uint32_t eid = idx.entry_id(nid, index); + LOG(INFO) << "\t\toutput " << eid << " stype: " << vstorage_type[eid]; + } + } + } if (inode.source->is_variable()) continue; #if MXNET_USE_PROFILER op_nodes_[nid].opr_name = inode.source->op()->name.c_str(); @@ -1104,7 +1294,7 @@ void GraphExecutor::InitCachedOps() { if (is_async) { exec->op_ctx.async_on_complete = on_complete; } - exec->Run(ctx); + exec->Run(ctx, is_gpu); // call on complete only if it is async op if (!is_async) { if (is_gpu) { @@ -1265,7 +1455,8 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) { CHECK_EQ(opnode.exec->out_array.size(), 1U); CopyFromTo(opnode.exec->in_array[0], &(opnode.exec->out_array[0])); } else if (opnode.exec->exec_type() == ExecType::kLocal) { - opnode.exec->Run(RunContext{opnode.ctx, nullptr}); + bool is_gpu = opnode.ctx.dev_mask() == gpu::kDevMask; + opnode.exec->Run(RunContext{opnode.ctx, nullptr}, is_gpu); } else if (opnode.cached_opr != nullptr) { #if MXNET_USE_PROFILER bool profiling = engine::Profiler::Get()->GetState() == engine::Profiler::kRunning; @@ -1335,7 +1526,7 @@ GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start, RunContext ctx, Engine::CallbackOnComplete on_complete) { // Run all opr in the sub-graph for (auto &exec : exec_list) { - exec->Run(ctx); + exec->Run(ctx, is_gpu); } if (is_gpu) { #if MXNET_USE_CUDA @@ -1370,6 +1561,7 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol, const 
std::vector& aux_state_ctxes, const std::unordered_map& arg_shape_map, const std::unordered_map& arg_dtype_map, + const std::unordered_map& arg_stype_map, const std::vector& grad_req_types, const std::unordered_set& shared_arg_names, std::vector* in_args, @@ -1380,7 +1572,7 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol, auto exec = new exec::GraphExecutor(); exec->Init(symbol, default_ctx, group2ctx, in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes, - arg_shape_map, arg_dtype_map, + arg_shape_map, arg_dtype_map, arg_stype_map, grad_req_types, shared_arg_names, in_args, arg_grads, aux_states, shared_buffer, shared_exec); diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h index dc50bef002ab..48222f05fae2 100644 --- a/src/executor/graph_executor.h +++ b/src/executor/graph_executor.h @@ -59,6 +59,7 @@ class GraphExecutor : public Executor { friend class autograd::AutogradRuntime; using Executor::MonitorCallback; + GraphExecutor(); virtual ~GraphExecutor(); void Forward(bool is_train) override; void PartialForward(bool is_train, int step, int *step_left) override; @@ -96,6 +97,7 @@ class GraphExecutor : public Executor { const std::vector& aux_state_ctxes, const std::unordered_map& arg_shape_map, const std::unordered_map& arg_dtype_map, + const std::unordered_map& arg_stype_map, const std::vector& grad_req_types, const std::unordered_set& shared_arg_names, std::vector* in_arg_vec, @@ -141,6 +143,7 @@ class GraphExecutor : public Executor { void InitArguments(const nnvm::IndexedGraph& idx, const nnvm::ShapeVector& inferred_shapes, const nnvm::DTypeVector& inferred_dtypes, + const StorageTypeVector& inferred_stypes, const std::vector& in_arg_ctxes, const std::vector& arg_grad_ctxes, const std::vector& aux_state_ctxes, @@ -153,6 +156,7 @@ class GraphExecutor : public Executor { void InitArguments(const nnvm::IndexedGraph& idx, const nnvm::ShapeVector& inferred_shapes, const nnvm::DTypeVector& inferred_dtypes, + const StorageTypeVector& 
inferred_stypes, const std::vector& in_arg_ctxes, const std::vector& arg_grad_ctxes, const std::vector& aux_state_ctxes, @@ -201,7 +205,8 @@ class GraphExecutor : public Executor { std::vector op_nodes_; // internal data entry of each node std::vector data_entry_; - // internal data pool of allocated entries + // internal data pool of allocated entries. + // these allocated entries can be used for static memory sharing between executors. std::vector data_pool_; // output arrays std::vector output_arrays_; @@ -233,6 +238,8 @@ class GraphExecutor : public Executor { bool prefer_bulk_execution_; // cached segment operator std::vector cached_seg_opr_; + // verbose logging + bool log_verbose_ = false; }; } // namespace exec diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc new file mode 100644 index 000000000000..144c3713e205 --- /dev/null +++ b/src/executor/infer_graph_attr_pass.cc @@ -0,0 +1,356 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file infer_graph_attr_pass.cc + * \brief infer graph shape, dtype, and storage type + */ + +#include +#include +#include "./exec_pass.h" + +namespace mxnet { +namespace exec { + +template +bool ApplyOpInferAttr(const nnvm::Graph& g, + const FInfer& finfer, + const NodeAttrs& attrs, + const uint32_t nid, + std::vector* in_attrs, + std::vector* out_attrs) { + return finfer(attrs, in_attrs, out_attrs); +} + +template<> +bool ApplyOpInferAttr(const nnvm::Graph& g, + const FInferStorageType& finfer, + const NodeAttrs& attrs, + const uint32_t nid, + std::vector* in_attrs, + std::vector* out_attrs) { + const ContextVector& ctxes = g.GetAttr("context"); + return finfer(attrs, ctxes[nid], in_attrs, out_attrs); +} + +/*!\brief + * This is a duplicate of the InferAttr function in nnvm with minor modification + * to support inferring storage type whose function signature is different from + * shape/type inference functions'. The nnvm InferAttr will be deprecated + * in the future. Please use interfaces InferShape, InferType, and InferStorageType + * to call this function. + */ +template +nnvm::Graph InferAttr(nnvm::Graph &&ret, + const AttrType empty_val, + const char* infer_name, + const char* input_name, + const char* attr_key_name, + const char* attr_name, + const char* unknown_name, + IsNone fis_none, + FDefault fdefault, + bool backward_identity_assign) { + using nnvm::IndexedGraph; + using nnvm::Op; + using AttrVector = std::vector; + using dmlc::any; + + const IndexedGraph& idx = ret.indexed_graph(); + static auto& finfer_shape = + Op::GetAttr(infer_name); + static auto& is_backward = + Op::GetAttr("TIsBackward"); + // gradient function, used to get node correspondence. 
+ static auto& fgrad = + Op::GetAttr("FGradient"); + // reshape shape vector + AttrVector rshape; + if (ret.attrs.count(attr_name) != 0) { + rshape = ret.MoveCopyAttr(attr_name); + } else { + rshape.resize(idx.num_node_entries(), empty_val); + } + + if (ret.attrs.count(input_name) != 0) { + const AttrVector& shape_args = ret.GetAttr(input_name); + CHECK_LE(shape_args.size(), idx.input_nodes().size()) + << "More provided " << attr_name << "s than number of arguments."; + for (size_t i = 0; i < shape_args.size(); ++i) { + rshape[idx.entry_id(idx.input_nodes()[i], 0)] = shape_args[i]; + } + // erase the provided arguments + ret.attrs.erase(input_name); + } + + // get the shape hints + std::string shape_hints_key = std::string(attr_name) + "_hints"; + if (ret.attrs.count(shape_hints_key)) { + nnvm::NodeEntryMap shape_hints = + ret.GetAttr>(shape_hints_key); + for (const auto& kv : shape_hints) { + nnvm::NodeEntry e = kv.first; + if (idx.exist(e.node.get())) { + rshape[idx.entry_id(kv.first)] = kv.second; + } + } + } + + std::string shape_attr_key; + if (ret.attrs.count(attr_key_name) != 0) { + shape_attr_key = ret.GetAttr(attr_key_name); + // erase the provided arguments + ret.attrs.erase(attr_key_name); + } + // Temp space for shape inference. + std::vector ishape, oshape; + + // inference step function for nid + auto infer_step = [&](uint32_t nid, bool last_iter) { + const auto& inode = idx[nid]; + const uint32_t num_inputs = inode.inputs.size(); + const uint32_t num_outputs = inode.source->num_outputs(); + if (inode.source->is_variable()) { + // Variable node. No operator. Only one output entry. 
+ CHECK(inode.source->op() == nullptr); + CHECK_EQ(num_outputs, 1U); + const uint32_t out_ent_id = idx.entry_id(nid, 0); + if (shape_attr_key.length() != 0 && fis_none(rshape[out_ent_id])) { + auto it = inode.source->attrs.dict.find(shape_attr_key); + if (it != inode.source->attrs.dict.end()) { + std::istringstream is(it->second); + CHECK(is >> rshape[out_ent_id]) << "Invalid attribute"; + } + } + } else if (is_backward.get(inode.source->op(), false) && + inode.control_deps.size() && backward_identity_assign) { + CHECK_GE(inode.control_deps.size(), 1U) + << "BackwardOp need to have control_deps to its forward op"; + const IndexedGraph::Node& fnode = idx[inode.control_deps[0]]; + nnvm::NodePtr fwd_ptr = inode.source->control_deps[0]; + CHECK(fwd_ptr->op() != nullptr) << "Forward op cannot be a variable"; + // use gradient function to find out the correspondence. + std::vector ograd(fwd_ptr->num_outputs()); + for (size_t i = 0; i < ograd.size(); ++i) { + ograd[i].index = static_cast(i); + } + // input gradient list + auto igrad = fgrad[fwd_ptr->op()](fwd_ptr, ograd); + const nnvm::Node* igrad_node = nullptr; + // Input gradient assignement + for (size_t i = 0; i < igrad.size(); ++i) { + if (igrad[i].node->op() == inode.source->op()) { + uint32_t eid = idx.entry_id(nid, igrad[i].index); + if (fis_none(rshape[eid])) { + rshape[eid] = rshape[idx.entry_id(fnode.inputs[i])]; + } else { + CHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])]) + << "Backward shape inconsistent with the forward shape"; + } + if (igrad_node == nullptr) { + igrad_node = igrad[i].node.get(); + } else { + CHECK(igrad_node == igrad[i].node.get()); + } + } + } + // out grad entries + CHECK(igrad_node != nullptr) + << "Cannot find matching backward op for " << inode.source->attrs.name; + for (size_t i = 0; i < igrad_node->inputs.size(); ++i) { + const nnvm::NodeEntry& e = igrad_node->inputs[i]; + if (e.node == nullptr) { + uint32_t eid = idx.entry_id(inode.inputs[i]); + if 
(fis_none(rshape[eid])) { + rshape[eid] = rshape[idx.entry_id(inode.control_deps[0], e.index)]; + } + } + } + } else { + bool forward_known = true; + // Forward operator inference. + ishape.resize(num_inputs, empty_val); + for (uint32_t i = 0; i < ishape.size(); ++i) { + ishape[i] = rshape[idx.entry_id(inode.inputs[i])]; + if (fis_none(ishape[i])) forward_known = false; + } + oshape.resize(num_outputs, empty_val); + for (uint32_t i = 0; i < oshape.size(); ++i) { + oshape[i] = rshape[idx.entry_id(nid, i)]; + if (fis_none(oshape[i])) forward_known = false; + } + auto finfer = finfer_shape.get(inode.source->op(), fdefault); + if (!forward_known) { + if (finfer != nullptr) { + // Call inference function of the operator. + try { + forward_known = ApplyOpInferAttr(ret, finfer, inode.source->attrs, + nid, &ishape, &oshape); + } catch (const std::exception& e) { + throw dmlc::Error("Error in operator " + inode.source->attrs.name + ": " + e.what()); + } + } else { + CHECK(!last_iter) + << "Attribute " << infer_name + << " is not registed by op " << inode.source->op()->name + << " we are not able to complete the inference because of this"; + } + } + // Save to the result map. 
+ for (uint32_t i = 0; i < num_inputs; ++i) { + rshape[idx.entry_id(inode.inputs[i])] = ishape[i]; + } + for (uint32_t i = 0; i < num_outputs; ++i) { + rshape[idx.entry_id(nid, i)] = oshape[i]; + } + } + }; + + size_t last_num_unknown; + size_t num_unknown = rshape.size(); + int i = 0; + do { + if (i % 2 == 0) { + for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { + infer_step(nid, false); + } + } else { + // backward inference + for (uint32_t i = idx.num_nodes(); i != 0; --i) { + infer_step(i - 1, false); + } + } + last_num_unknown = num_unknown; + num_unknown = 0; + for (size_t j = 0; j < idx.num_node_entries(); ++j) { + if (fis_none(rshape[j])) { + ++num_unknown; + } + } + ++i; + } while (num_unknown > 0 && last_num_unknown > num_unknown); + // set the shapes + ret.attrs[attr_name] = std::make_shared(std::move(rshape)); + // number of nodes who knows the shape. + ret.attrs[unknown_name] = std::make_shared(num_unknown); + return ret; +} + +// inference fucntion for same type +inline bool SameType(const nnvm::NodeAttrs& attrs, + std::vector *iattr, + std::vector *oattr) { + int def_v = -1; + for (int v : *oattr) { + if (v != -1) { + def_v = v; break; + } + } + if (def_v == -1) { + for (int v : *iattr) { + if (v != -1) { + def_v = v; break; + } + } + } + if (def_v == -1) return false; + for (int& v : *oattr) { + v = def_v; + } + for (int& v : *iattr) { + v = def_v; + } + return true; +} + +// assigning default type N to both input and output attrs with value -1 +template +inline bool DefaultType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *iattr, + std::vector *oattr) { + // TODO(junwu): check whether need to use ctx + for (int& v : *oattr) { + if (v == none) v = default_val; + } + for (int& v : *iattr) { + if (v == none) v = default_val; + } + return true; +} + +nnvm::Graph InferShape(nnvm::Graph graph, + nnvm::ShapeVector shape_inputs, + const std::string& shape_attr_key) { + using dmlc::any; + if (shape_inputs.size() != 0) { + 
graph.attrs["shape_inputs"] = std::make_shared(std::move(shape_inputs)); + } + if (shape_attr_key.length() != 0) { + graph.attrs["shape_attr_key"] = std::make_shared(std::move(shape_attr_key)); + } + return InferAttr( + std::move(graph), nnvm::TShape(), + "FInferShape", "shape_inputs", "shape_attr_key", + "shape", "shape_num_unknown_nodes", + [](const nnvm::TShape& s) { return s.ndim() == 0 || s.Size() == 0; }, + nullptr, true); +} + +nnvm::Graph InferType(nnvm::Graph graph, + nnvm::DTypeVector dtype_inputs, + const std::string& dtype_attr_key) { + using dmlc::any; + if (dtype_inputs.size() != 0) { + graph.attrs["dtype_inputs"] = std::make_shared(std::move(dtype_inputs)); + } + if (dtype_attr_key.length() != 0) { + graph.attrs["dtype_attr_key"] = std::make_shared(std::move(dtype_attr_key)); + } + return InferAttr( + std::move(graph), -1, + "FInferType", "dtype_inputs", "dtype_attr_key", + "dtype", "dtype_num_unknown_nodes", + [](const int t) { return t == -1; }, + SameType, true); +} + +nnvm::Graph InferStorageType(nnvm::Graph graph, + StorageTypeVector storage_type_inputs, + const std::string& storage_type_attr_key) { + using dmlc::any; + if (storage_type_inputs.size() != 0) { + graph.attrs["storage_type_inputs"] = std::make_shared(std::move(storage_type_inputs)); + } + if (storage_type_attr_key.length() != 0) { + graph.attrs["storage_type_attr_key"] = std::make_shared(std::move(storage_type_attr_key)); + } + // for storage type, the backward attr is not necessarily the same as it's correspondence + const int kDefaultStorage = 0; + return InferAttr( + std::move(graph), -1, + "FInferStorageType", "storage_type_inputs", "storage_type_attr_key", + "storage_type", "storage_type_num_unknown_nodes", + [](const int t) { return t == -1; }, + DefaultType, false); +} + +} // namespace exec +} // namespace mxnet diff --git a/src/executor/inplace_addto_detect_pass.cc b/src/executor/inplace_addto_detect_pass.cc index 26a91e3f1b5e..9359d8863594 100644 --- 
a/src/executor/inplace_addto_detect_pass.cc +++ b/src/executor/inplace_addto_detect_pass.cc @@ -62,6 +62,8 @@ Graph DetectInplaceAddTo(Graph g) { uint32_t eid_rhs = idx.entry_id(inode.inputs[1]); if (ref_count[eid_rhs] != 1) continue; if (inode.inputs[0].node_id >= inode.inputs[1].node_id) continue; + // TODO(haibin) support inplace addto for Dynamic Storage + if (storage_id[eid_rhs] == kDynamicStorageID) continue; CHECK_NE(storage_id[eid_rhs], sid); storage_id[eid_rhs] = sid; addto_entry[eid_rhs] = 1; diff --git a/src/io/iter_batchloader.h b/src/io/iter_batchloader.h index c5ec10618080..ade7c1a53bd2 100644 --- a/src/io/iter_batchloader.h +++ b/src/io/iter_batchloader.h @@ -41,7 +41,7 @@ namespace io { class BatchLoader : public IIterator { public: explicit BatchLoader(IIterator *base): - base_(base), head_(1), num_overflow_(0) { + head_(1), num_overflow_(0), base_(base) { } virtual ~BatchLoader(void) { @@ -52,7 +52,7 @@ class BatchLoader : public IIterator { std::vector > kwargs_left; // init batch param, it could have similar param with kwargs_left = param_.InitAllowUnknown(kwargs); - // Init space for out_ + // Init space for out out_.inst_index = new unsigned[param_.batch_size]; out_.batch_size = param_.batch_size; out_.data.clear(); @@ -69,6 +69,7 @@ class BatchLoader : public IIterator { } head_ = 1; } + virtual bool Next(void) { out_.num_batch_padd = 0; out_.batch_size = param_.batch_size; @@ -128,23 +129,25 @@ class BatchLoader : public IIterator { return out_; } - private: + protected: /*! \brief batch parameters */ BatchParam param_; /*! \brief output data */ TBlobBatch out_; - /*! \brief base iterator */ - IIterator *base_; /*! \brief on first */ int head_; /*! \brief number of overflow instances that readed in round_batch mode */ int num_overflow_; + /*! \brief tensor to hold data */ + std::vector data_; + + private: + /*! \brief base iterator */ + IIterator *base_; /*! \brief data shape */ std::vector shape_; /*! 
\brief unit size */ std::vector unit_size_; - /*! \brief tensor to hold data */ - std::vector data_; // initialize the data holder by using from the first batch. inline void InitData(const DataInst& first_batch) { shape_.resize(first_batch.data.size()); diff --git a/src/io/iter_libsvm.cc b/src/io/iter_libsvm.cc new file mode 100644 index 000000000000..803d19e74481 --- /dev/null +++ b/src/io/iter_libsvm.cc @@ -0,0 +1,288 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file iter_libsvm.cc + * \brief define a LibSVM Reader to read in arrays + */ +#include +#include +#include +#include +#include +#include "./iter_sparse_prefetcher.h" +#include "./iter_sparse_batchloader.h" + +namespace mxnet { +namespace io { +// LibSVM parameters +struct LibSVMIterParam : public dmlc::Parameter { + /*! \brief path to data libsvm file */ + std::string data_libsvm; + /*! \brief data shape */ + TShape data_shape; + /*! \brief path to label libsvm file */ + std::string label_libsvm; + /*! \brief label shape */ + TShape label_shape; + /*! \brief partition the data into multiple parts */ + int num_parts; + /*! 
\brief the index of the part will read*/ + int part_index; + // declare parameters + DMLC_DECLARE_PARAMETER(LibSVMIterParam) { + DMLC_DECLARE_FIELD(data_libsvm) + .describe("The input LibSVM file or a directory path."); + DMLC_DECLARE_FIELD(data_shape) + .describe("The shape of one example."); + DMLC_DECLARE_FIELD(label_libsvm).set_default("NULL") + .describe("The input LibSVM file or a directory path. " + "If NULL, all labels will be read from ``data_libsvm``."); + index_t shape1[] = {1}; + DMLC_DECLARE_FIELD(label_shape).set_default(TShape(shape1, shape1 + 1)) + .describe("The shape of one label."); + DMLC_DECLARE_FIELD(num_parts).set_default(1) + .describe("partition the data into multiple parts"); + DMLC_DECLARE_FIELD(part_index).set_default(0) + .describe("the index of the part will read"); + } +}; + +class LibSVMIter: public SparseIIterator { + public: + LibSVMIter() {} + virtual ~LibSVMIter() {} + + // intialize iterator loads data in + virtual void Init(const std::vector >& kwargs) { + param_.InitAllowUnknown(kwargs); + CHECK_EQ(param_.data_shape.ndim(), 1) << "dimension of data_shape is expected to be 1"; + CHECK_GT(param_.num_parts, 0) << "number of parts should be positive"; + CHECK_GE(param_.part_index, 0) << "part index should be non-negative"; + data_parser_.reset(dmlc::Parser::Create(param_.data_libsvm.c_str(), + param_.part_index, + param_.num_parts, "libsvm")); + if (param_.label_libsvm != "NULL") { + label_parser_.reset(dmlc::Parser::Create(param_.label_libsvm.c_str(), + param_.part_index, + param_.num_parts, "libsvm")); + CHECK_GT(param_.label_shape.Size(), 1) + << "label_shape is not expected to be (1,) when param_.label_libsvm is set."; + } else { + CHECK_EQ(param_.label_shape.Size(), 1) + << "label_shape is expected to be (1,) when param_.label_libsvm is NULL"; + } + // both data and label are of CSRStorage in libsvm format + if (param_.label_shape.Size() > 1) { + out_.data.resize(6); + } else { + // only data is of CSRStorage in libsvm 
format. + out_.data.resize(4); + } + } + + virtual void BeforeFirst() { + data_parser_->BeforeFirst(); + if (label_parser_.get() != nullptr) { + label_parser_->BeforeFirst(); + } + data_ptr_ = label_ptr_ = 0; + data_size_ = label_size_ = 0; + inst_counter_ = 0; + end_ = false; + } + + virtual bool Next() { + if (end_) return false; + while (data_ptr_ >= data_size_) { + if (!data_parser_->Next()) { + end_ = true; return false; + } + data_ptr_ = 0; + data_size_ = data_parser_->Value().size; + } + out_.index = inst_counter_++; + CHECK_LT(data_ptr_, data_size_); + const auto data_row = data_parser_->Value()[data_ptr_++]; + // data, indices and indptr + out_.data[0] = AsDataBlob(data_row); + out_.data[1] = AsIdxBlob(data_row); + out_.data[2] = AsIndPtrPlaceholder(data_row); + + if (label_parser_.get() != nullptr) { + while (label_ptr_ >= label_size_) { + CHECK(label_parser_->Next()) + << "Data LibSVM's row is smaller than the number of rows in label_libsvm"; + label_ptr_ = 0; + label_size_ = label_parser_->Value().size; + } + CHECK_LT(label_ptr_, label_size_); + const auto label_row = label_parser_->Value()[label_ptr_++]; + // data, indices and indptr + out_.data[3] = AsDataBlob(label_row); + out_.data[4] = AsIdxBlob(label_row); + out_.data[5] = AsIndPtrPlaceholder(label_row); + } else { + out_.data[3] = AsScalarLabelBlob(data_row); + } + return true; + } + + virtual const DataInst &Value(void) const { + return out_; + } + + virtual const NDArrayStorageType GetStorageType(bool is_data) const { + if (is_data) return kCSRStorage; + return param_.label_shape.Size() > 1 ? 
kCSRStorage : kDefaultStorage; + } + + virtual const TShape GetShape(bool is_data) const { + if (is_data) return param_.data_shape; + return param_.label_shape; + } + + private: + inline TBlob AsDataBlob(const dmlc::Row& row) { + const real_t* ptr = row.value; + TShape shape(mshadow::Shape1(row.length)); + return TBlob((real_t*) ptr, shape, cpu::kDevMask); // NOLINT(*) + } + + inline TBlob AsIdxBlob(const dmlc::Row& row) { + const uint64_t* ptr = row.index; + TShape shape(mshadow::Shape1(row.length)); + return TBlob((int64_t*) ptr, shape, cpu::kDevMask, mshadow::kInt64); // NOLINT(*) + } + + inline TBlob AsIndPtrPlaceholder(const dmlc::Row& row) { + return TBlob(nullptr, mshadow::Shape1(0), cpu::kDevMask, mshadow::kInt64); + } + + inline TBlob AsScalarLabelBlob(const dmlc::Row& row) { + const real_t* ptr = row.label; + return TBlob((real_t*) ptr, mshadow::Shape1(1), cpu::kDevMask); // NOLINT(*) + } + + LibSVMIterParam param_; + // output instance + DataInst out_; + // internal instance counter + unsigned inst_counter_{0}; + // at end + bool end_{false}; + // label parser + size_t label_ptr_{0}, label_size_{0}; + size_t data_ptr_{0}, data_size_{0}; + std::unique_ptr > label_parser_; + std::unique_ptr > data_parser_; +}; + + +DMLC_REGISTER_PARAMETER(LibSVMIterParam); + +MXNET_REGISTER_IO_ITER(LibSVMIter) +.describe(R"code(Returns the LibSVM file iterator. This iterator is experimental and +should be used with care. + +The input data is similar to libsvm file format, except that the indices are expected to be +zero-based instead of one-based. Details of the libsvm format are available at +`https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/` + +In this function, the `data_shape` parameter is used to set the shape of each line of the data. +The dimension of both `data_shape` and `label_shape` are expected to be 1. + +When `label_libsvm` is set to ``NULL``, both data and label are read from the same file specified +by `data_libsvm`. 
Otherwise, data is read from `data_libsvm` and label from `label_libsvm`, +in this case, if `data_libsvm` contains label, it will be ignored. + +The `LibSVMIter` only supports the `round_batch` parameter set to ``True`` for now. So, if `batch_size` +is 3 and there are 4 total rows in libsvm file, 2 more examples +are consumed at the first round. If `reset` function is called after first round, +the call is ignored and remaining examples are returned in the second round. + +If ``data_libsvm = 'data/'`` is set, then all the files in this directory will be read. + +Examples:: + + // Contents of libsvm file ``data.t``. + 1.0 0:0.5 2:1.2 + -2.0 + -3.0 0:0.6 1:2.4 2:1.2 + 4 2:-1.2 + + // Creates a `LibSVMIter` with `batch_size`=3. + LibSVMIter = mx.io.LibSVMIter(data_libsvm = 'data.t', data_shape = (3,), + batch_size = 3) + + // The first batch (data and label) + [[ 0.5 0. 1.2 ] + [ 0. 0. 0. ] + [ 0.6 2.4 1.2 ]] + + [ 1. -2. -3.] + + // The second batch (data and label) + [[ 0. 0. -1.2 ] + [ 0.5 0. 1.2 ] + [ 0. 0. 0. ]] + + [ 4. 1. -2.] + + // Contents of libsvm file ``label.t`` + 1.0 + -2.0 0:0.125 + -3.0 2:1.2 + 4 1:1.0 2:-1.2 + + // Creates a `LibSVMIter` with specified label file + LibSVMIter = mx.io.LibSVMIter(data_libsvm = 'data.t', data_shape = (3,), + label_libsvm = 'label.t', label_shape = (3,), batch_size = 3) + + // Two batches of data read from the above iterator are as follows (data and label): + // The first batch + [[ 0.5 0. 1.2 ] + [ 0. 0. 0. ] + [ 0.6 2.4 1.2 ]] + + [[ 0. 0. 0. ] + [ 0.125 0. 0. ] + [ 0. 0. 1.2 ]] + + // The second batch + [[ 0. 0. -1.2 ] + [ 0.5 0. 1.2 ] + [ 0. 0. 0. ]] + + [[ 0. 1. -1.2 ] + [ 0. 0. 0. ] + [ 0.125 0. 0. 
]] + +)code" ADD_FILELINE) +.add_arguments(LibSVMIterParam::__FIELDS__()) +.add_arguments(BatchParam::__FIELDS__()) +.add_arguments(PrefetcherParam::__FIELDS__()) +.set_body([]() { + return new SparsePrefetcherIter( + new SparseBatchLoader( + new LibSVMIter())); + }); + +} // namespace io +} // namespace mxnet diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h index 89960c71a12f..a743b5132821 100644 --- a/src/io/iter_prefetcher.h +++ b/src/io/iter_prefetcher.h @@ -46,8 +46,7 @@ namespace io { class PrefetcherIter : public IIterator { public: explicit PrefetcherIter(IIterator* base) - : loader_(base), out_(nullptr) { - } + : loader_(base), out_(nullptr) {} ~PrefetcherIter() { while (recycle_queue_.size() != 0) { @@ -56,21 +55,24 @@ class PrefetcherIter : public IIterator { delete batch; } delete out_; - iter_.Destroy(); + iter.Destroy(); } - virtual void Init(const std::vector >& kwargs) { + void InitParams(const std::vector >& kwargs) { std::vector > kwargs_left; // init image rec param kwargs_left = param_.InitAllowUnknown(kwargs); - // use the kwarg to init batch loader - loader_->Init(kwargs); // maximum prefetch threaded iter internal size const int kMaxPrefetchBuffer = 16; // init thread iter - iter_.set_max_capacity(kMaxPrefetchBuffer); + iter.set_max_capacity(kMaxPrefetchBuffer); + } - iter_.Init([this](DataBatch **dptr) { + virtual void Init(const std::vector >& kwargs) { + InitParams(kwargs); + // use the kwarg to init batch loader + loader_->Init(kwargs); + iter.Init([this](DataBatch **dptr) { if (!loader_->Next()) return false; const TBlobBatch& batch = loader_->Value(); if (*dptr == nullptr) { @@ -109,7 +111,7 @@ class PrefetcherIter : public IIterator { } virtual void BeforeFirst(void) { - iter_.BeforeFirst(); + iter.BeforeFirst(); } virtual bool Next(void) { @@ -124,9 +126,9 @@ class PrefetcherIter : public IIterator { arr.WaitToWrite(); } recycle_queue_.pop(); - iter_.Recycle(&old_batch); + iter.Recycle(&old_batch); } - return 
iter_.Next(&out_); + return iter.Next(&out_); } virtual const DataBatch &Value(void) const { return *out_; @@ -135,16 +137,16 @@ class PrefetcherIter : public IIterator { protected: /*! \brief prefetcher parameters */ PrefetcherParam param_; - /*! \brief internal batch loader */ - std::unique_ptr > loader_; + /*! \brief backend thread */ + dmlc::ThreadedIter iter; private: + /*! \brief internal batch loader */ + std::unique_ptr > loader_; /*! \brief output data */ DataBatch *out_; /*! \brief queue to be recycled */ std::queue recycle_queue_; - /*! \brief backend thread */ - dmlc::ThreadedIter iter_; }; } // namespace io } // namespace mxnet diff --git a/src/io/iter_sparse.h b/src/io/iter_sparse.h new file mode 100644 index 000000000000..beaf5c682998 --- /dev/null +++ b/src/io/iter_sparse.h @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file iter_sparse.h + * \brief mxnet sparse data iterator + */ +#ifndef MXNET_IO_ITER_SPARSE_H_ +#define MXNET_IO_ITER_SPARSE_H_ + +#include +#include + +namespace mxnet { +/*! + * \brief iterator type + * \param DType data type + */ +template +class SparseIIterator : public IIterator { + public: + /*! 
\brief storage type of the data or label */ + virtual const NDArrayStorageType GetStorageType(bool is_data) const = 0; + /*! \brief shape of the data or label */ + virtual const TShape GetShape(bool is_data) const = 0; +}; // class SparseIIterator + +} // namespace mxnet +#endif // MXNET_IO_ITER_SPARSE_H_ diff --git a/src/io/iter_sparse_batchloader.h b/src/io/iter_sparse_batchloader.h new file mode 100644 index 000000000000..d5c9bd2f4578 --- /dev/null +++ b/src/io/iter_sparse_batchloader.h @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file iter_sparse_batchloader.h + * \brief define a batch adapter to create sparse tblob batch + */ +#ifndef MXNET_IO_ITER_SPARSE_BATCHLOADER_H_ +#define MXNET_IO_ITER_SPARSE_BATCHLOADER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include "./inst_vector.h" +#include "./image_iter_common.h" +#include "./iter_batchloader.h" +#include "./iter_sparse.h" + +namespace mxnet { +namespace io { + +/*! 
\brief create a batch iterator from single instance iterator */ +class SparseBatchLoader : public BatchLoader, public SparseIIterator { + public: + explicit SparseBatchLoader(SparseIIterator *base): + BatchLoader(base), sparse_base_(base) { + } + + virtual ~SparseBatchLoader(void) {} + + inline void Init(const std::vector >& kwargs) { + BatchLoader::Init(kwargs); + data_stype_ = sparse_base_->GetStorageType(true); + label_stype_ = sparse_base_->GetStorageType(false); + if (param_.round_batch == 0) { + LOG(FATAL) << "sparse batch loader doesn't support round_batch == false yet"; + } + } + + virtual void BeforeFirst(void) { + BatchLoader::BeforeFirst(); + } + + virtual bool Next(void) { + out_.num_batch_padd = 0; + out_.batch_size = param_.batch_size; + this->head_ = 0; + // if overflown from previous round, directly return false, until before first is called + if (num_overflow_ != 0) return false; + index_t top = 0; + inst_cache_.clear(); + while (sparse_base_->Next()) { + inst_cache_.emplace_back(sparse_base_->Value()); + if (inst_cache_.size() >= param_.batch_size) break; + } + // no more data instance + if (inst_cache_.size() == 0) { + return false; + } + if (inst_cache_.size() < param_.batch_size) { + CHECK_GT(param_.round_batch, 0); + num_overflow_ = 0; + sparse_base_->BeforeFirst(); + for (; inst_cache_.size() < param_.batch_size; ++num_overflow_) { + CHECK(sparse_base_->Next()) << "number of input must be bigger than batch size"; + inst_cache_.emplace_back(sparse_base_->Value()); + } + } + out_.num_batch_padd = num_overflow_; + CHECK_EQ(inst_cache_.size(), param_.batch_size); + this->InitDataFromBatch(); + for (size_t j = 0; j < inst_cache_.size(); j++) { + const auto& d = inst_cache_[j]; + out_.inst_index[top++] = d.index;  // advance the slot so every instance records its own index + // TODO(haibin) double check the type? 
+ int64_t unit_size = 0; + for (size_t i = 0; i < d.data.size(); ++i) { + // indptr tensor + if (IsIndPtr(i)) { + auto indptr = data_[i].get(); + if (j == 0) indptr[0] = 0; + indptr[j + 1] = indptr[j] + unit_size; + offsets_[i] = j; + } else { + // indices and values tensor + unit_size = d.data[i].shape_.Size(); + MSHADOW_TYPE_SWITCH(data_[i].type_flag_, DType, { + const auto begin = offsets_[i]; + const auto end = offsets_[i] + unit_size; + mshadow::Copy(data_[i].get().Slice(begin, end), + d.data[i].get_with_shape(mshadow::Shape1(unit_size))); + }); + offsets_[i] += unit_size; + } + } + } + return true; + } + + virtual const TBlobBatch &Value(void) const { + return BatchLoader::Value(); + } + + virtual const NDArrayStorageType GetStorageType(bool is_data) const { + return sparse_base_->GetStorageType(is_data); + } + + virtual const TShape GetShape(bool is_data) const { + TShape inst_shape = sparse_base_->GetShape(is_data); + std::vector shape_vec; + shape_vec.push_back(param_.batch_size); + for (index_t dim = 0; dim < inst_shape.ndim(); ++dim) { + shape_vec.push_back(inst_shape[dim]); + } + return TShape(shape_vec.begin(), shape_vec.end()); + } + + private: + /*! \brief base sparse iterator */ + SparseIIterator *sparse_base_; + /*! \brief data instances */ + std::vector inst_cache_; + /*! \brief data storage type */ + NDArrayStorageType data_stype_; + /*! \brief data label type */ + NDArrayStorageType label_stype_; + /*! 
\brief tensor offset for slicing */ + std::vector offsets_; + + // check whether ith position is the indptr tensor for a CSR tensor + inline bool IsIndPtr(size_t i) { + auto data_num_aux = num_aux_data(data_stype_); + auto label_num_aux = num_aux_data(label_stype_); + auto label_indptr_offset = data_num_aux + 1 + label_num_aux; + // data indptr + if (i == data_num_aux && data_stype_ == kCSRStorage) { + return true; + } + // label indptr + if (i == label_indptr_offset && label_stype_ == kCSRStorage && data_stype_ == kCSRStorage) { + return true; + } + return false; + } + + // initialize the data holder by using from the batch + inline void InitDataFromBatch() { + CHECK(data_stype_ == kCSRStorage || label_stype_ == kCSRStorage); + CHECK_GT(inst_cache_.size(), 0); + out_.data.clear(); + data_.clear(); + offsets_.clear(); + + size_t total_size = inst_cache_[0].data.size(); + data_.resize(total_size); + offsets_.resize(total_size, 0); + std::vector vec_sizes(total_size, 0); + // accumulate the memory required for a batch + for (size_t i = 0; i < total_size; ++i) { + size_t size = 0; + // vec_size for indptr + if (IsIndPtr(i)) { + size = param_.batch_size + 1; + } else { + for (const auto &d : inst_cache_) size += d.data[i].shape_.Size(); + } + vec_sizes[i] = size; + } + + CHECK_EQ(vec_sizes[0], vec_sizes[1]); + for (size_t i = 0; i < total_size; ++i) { + int src_type_flag = inst_cache_[0].data[i].type_flag_; + // init object attributes + TShape dst_shape(mshadow::Shape1(vec_sizes[i])); + data_[i].resize(mshadow::Shape1(vec_sizes[i]), src_type_flag); + CHECK(data_[i].dptr_ != nullptr); + out_.data.push_back(TBlob(data_[i].dptr_, dst_shape, cpu::kDevMask, src_type_flag)); + } + } +}; // class BatchLoader +} // namespace io +} // namespace mxnet +#endif // MXNET_IO_ITER_SPARSE_BATCHLOADER_H_ diff --git a/src/io/iter_sparse_prefetcher.h b/src/io/iter_sparse_prefetcher.h new file mode 100644 index 000000000000..3908f9bd3826 --- /dev/null +++ b/src/io/iter_sparse_prefetcher.h 
@@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file iter_sparse_prefetcher.h + * \brief define a prefetcher using threaditer to keep k batch fetched + */ +#ifndef MXNET_IO_ITER_SPARSE_PREFETCHER_H_ +#define MXNET_IO_ITER_SPARSE_PREFETCHER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "./inst_vector.h" +#include "./image_iter_common.h" +#include "./iter_prefetcher.h" +#include "./iter_sparse.h" + +namespace mxnet { +namespace io { +// iterator on sparse data +class SparsePrefetcherIter : public PrefetcherIter { + public: + explicit SparsePrefetcherIter(SparseIIterator* base) + : PrefetcherIter(base), sparse_loader_(base) {} + + ~SparsePrefetcherIter() {} + + virtual void Init(const std::vector >& kwargs) { + PrefetcherIter::InitParams(kwargs); + // use the kwarg to init batch loader + sparse_loader_->Init(kwargs); + iter.Init([this](DataBatch **dptr) { + if (!sparse_loader_->Next()) return false; + const TBlobBatch& batch = sparse_loader_->Value(); + if (*dptr == nullptr) { + // allocate databatch + *dptr = new DataBatch(); + (*dptr)->num_batch_padd = batch.num_batch_padd; + // 
(*dptr)->data.at(0) => data + // (*dptr)->data.at(1) => label + (*dptr)->data.resize(2); + (*dptr)->index.resize(batch.batch_size); + size_t data_iter = 0; + for (size_t i = 0; i < (*dptr)->data.size(); ++i) { + bool is_data = i == 0; + auto stype = this->GetStorageType(is_data); + auto dtype = param_.dtype ? param_.dtype.value() : batch.data[data_iter].type_flag_; + if (stype == kDefaultStorage) { + (*dptr)->data.at(i) = NDArray(batch.data[data_iter].shape_, + Context::CPU(), false, dtype); + } else { + (*dptr)->data.at(i) = NDArray(stype, this->GetShape(is_data), + Context::CPU(), false, dtype); + } + data_iter += num_aux_data(stype) + 1; + } + } + // copy data over + size_t data_iter = 0; + for (size_t i = 0; i < (*dptr)->data.size(); ++i) { + auto& nd = ((*dptr)->data)[i]; + auto stype = nd.storage_type(); + auto& data_i = ((*dptr)->data)[i]; + if (stype == kDefaultStorage) { + CopyFromTo(data_i.data(), batch.data[data_iter]); + } else if (stype == kCSRStorage) { + auto& values = batch.data[data_iter]; + auto& indices = batch.data[data_iter + 1]; + auto& indptr = batch.data[data_iter + 2]; + // allocate memory + CHECK_EQ(indices.shape_.Size(), values.shape_.Size()); + nd.CheckAndAllocAuxData(csr::kIdx, indices.shape_); + nd.CheckAndAllocData(values.shape_); + nd.CheckAndAllocAuxData(csr::kIndPtr, indptr.shape_); + // copy values, indices and indptr + CopyFromTo(data_i.data(), values); + CopyFromTo(data_i.aux_data(csr::kIdx), indices); + CopyFromTo(data_i.aux_data(csr::kIndPtr), indptr); + } else { + LOG(FATAL) << "Storage type not implemented: " << stype; + } + data_iter += num_aux_data(stype) + 1; + (*dptr)->num_batch_padd = batch.num_batch_padd; + } + if (batch.inst_index) { + std::copy(batch.inst_index, + batch.inst_index + batch.batch_size, + (*dptr)->index.begin()); + } + return true; + }, + [this]() { sparse_loader_->BeforeFirst(); }); + } + + virtual void BeforeFirst(void) { + PrefetcherIter::BeforeFirst(); + } + + virtual bool Next(void) { + return 
PrefetcherIter::Next(); + } + virtual const DataBatch &Value(void) const { + return PrefetcherIter::Value(); + } + + virtual const NDArrayStorageType GetStorageType(bool is_data) const { + return sparse_loader_->GetStorageType(is_data); + } + + virtual const TShape GetShape(bool is_data) const { + return sparse_loader_->GetShape(is_data); + } + + private: + /*! \brief internal sparse batch loader */ + SparseIIterator* sparse_loader_; + + inline void CopyFromTo(TBlob dst, const TBlob src) { + MSHADOW_TYPE_SWITCH(src.type_flag_, DType, { + mshadow::Copy(dst.FlatTo1D(), src.FlatTo1D()); + }); + } +}; +} // namespace io +} // namespace mxnet +#endif // MXNET_IO_ITER_SPARSE_PREFETCHER_H_ diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index ade9c95feda7..cd0d3ab02825 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -21,13 +21,17 @@ */ #ifndef MXNET_KVSTORE_COMM_H_ #define MXNET_KVSTORE_COMM_H_ +#include #include #include #include #include #include #include +#include #include "mxnet/ndarray.h" +#include "../ndarray/ndarray_function.h" +#include "../operator/tensor/sparse_retain-inl.h" namespace mxnet { namespace kvstore { /** @@ -40,9 +44,10 @@ class Comm { } virtual ~Comm() { } /** - * \brief init key with the data shape + * \brief init key with the data shape and storage shape */ - virtual void Init(int key, const TShape& shape, int dtype = mshadow::kFloat32) = 0; + virtual void Init(int key, const NDArrayStorageType stype, + const TShape& shape, int dtype = mshadow::kFloat32) = 0; /** * \brief returns src[0] + .. 
+ src[src.size()-1] */ @@ -55,6 +60,18 @@ class Comm { int key, const NDArray& src, const std::vector dst, int priority) = 0; + /** + * \brief broadcast src to dst[i] with target row_ids for every i + * \param dst a list of destination row_sparse NDArray and its target row_ids to broadcast, + where the row_ids are expected to be unique and sorted + * \param use_copy if set to true, directly copy src to dst[i] without looking up the + provided row_ids + */ + virtual void BroadcastRowSparse(int key, const NDArray& src, + const std::vector>& dst, + const bool use_copy, + const int priority) = 0; + /** * \brief return a pinned contex */ @@ -75,43 +92,85 @@ class CommCPU : public Comm { CommCPU() { nthread_reduction_ = dmlc::GetEnv("MXNET_KVSTORE_REDUCTION_NTHREADS", 4); bigarray_bound_ = dmlc::GetEnv("MXNET_KVSTORE_BIGARRAY_BOUND", 1000 * 1000); + // TODO(junwu) delete the following data member, now for benchmark only + is_serial_push_ = dmlc::GetEnv("MXNET_KVSTORE_SERIAL_PUSH", 0); } virtual ~CommCPU() { } - void Init(int key, const TShape& shape, int type = mshadow::kFloat32) override { - merge_buf_[key].merged = NDArray(shape, pinned_ctx_, false, type); + void Init(int key, const NDArrayStorageType stype, const TShape& shape, + int type = mshadow::kFloat32) override { + if (stype == kDefaultStorage) { + merge_buf_[key].merged = NDArray(shape, pinned_ctx_, false, type); + } else { + merge_buf_[key].merged = NDArray(stype, shape, pinned_ctx_, true, type); + } } const NDArray& Reduce(int key, const std::vector& src, int priority) override { + auto& buf = merge_buf_[key]; // avoid extra copy for single device, but it may bring problems for // abnormal usage of kvstore if (src.size() == 1) { - return src[0]; + if (src[0].storage_type() == kDefaultStorage) { + return src[0]; + } else { // if sparse and only one GPU, always update weight on CPU + CopyFromTo(src[0], &buf.merged, priority); + return buf.merged; + } } - std::vector const_vars(src.size() - 1); - std::vector 
reduce(src.size()); - auto& buf = merge_buf_[key]; - CopyFromTo(src[0], &buf.merged, priority); - reduce[0] = buf.merged; - if (buf.copy_buf.empty()) { - buf.copy_buf.resize(src.size()-1); - for (size_t j = 0; j < src.size() - 1; ++j) { - buf.copy_buf[j] = NDArray( - src[0].shape(), pinned_ctx_, false, src[0].dtype()); + if (buf.merged.storage_type() == kDefaultStorage) { + std::vector const_vars(src.size() - 1); + std::vector reduce(src.size()); + CopyFromTo(src[0], &buf.merged, priority); + reduce[0] = buf.merged; + + if (buf.copy_buf.empty()) { + buf.copy_buf.resize(src.size()-1); + for (size_t j = 0; j < src.size() - 1; ++j) { + // allocate NDArray basd on storage type + buf.copy_buf[j] = NDArray( + src[0].shape(), pinned_ctx_, false, src[0].dtype()); + } } - } - for (size_t i = 1; i < src.size(); ++i) { - CopyFromTo(src[i], &(buf.copy_buf[i-1]), priority); - reduce[i] = buf.copy_buf[i-1]; - const_vars[i-1] = reduce[i].var(); - } + for (size_t i = 1; i < src.size(); ++i) { + CopyFromTo(src[i], &(buf.copy_buf[i-1]), priority); + reduce[i] = buf.copy_buf[i-1]; + const_vars[i-1] = reduce[i].var(); + } + + Engine::Get()->PushSync([reduce, this](RunContext rctx) { + ReduceSumCPU(reduce); + }, Context::CPU(), const_vars, {reduce[0].var()}, + FnProperty::kCPUPrioritized, priority, PROFILER_MESSAGE("KVStoreReduce")); - Engine::Get()->PushSync([reduce, this](RunContext rctx) { - ReduceSumCPU(reduce); - }, Context::CPU(), const_vars, {reduce[0].var()}, - FnProperty::kCPUPrioritized, priority, PROFILER_MESSAGE("KVStoreReduce")); + } else { + // buf.merged is a sparse ndarray. 
+ std::vector const_vars(src.size()); + std::vector reduce(src.size()); + + if (buf.copy_buf.empty()) { + buf.copy_buf.resize(src.size()); + for (size_t j = 0; j < src.size(); ++j) { + buf.copy_buf[j] = NDArray( + src[0].storage_type(), src[0].shape(), pinned_ctx_, true, src[0].dtype()); + } + } + for (size_t i = 0; i < src.size(); ++i) { + CopyFromTo(src[i], &(buf.copy_buf[i]), priority); + reduce[i] = buf.copy_buf[i]; + const_vars[i] = reduce[i].var(); + } + auto result = buf.merged; + Engine::Get()->PushSync([reduce, result, this](RunContext rctx) { + NDArray out = result; + is_serial_push_? + ReduceSumCPUExSerial(reduce, &out) + : mxnet::ndarray::ElementwiseSum(rctx.get_stream(), reduce, &out); + }, Context::CPU(), const_vars, {result.var()}, + FnProperty::kCPUPrioritized, priority, PROFILER_MESSAGE("KVStoreReduce")); + } return buf.merged; } @@ -129,7 +188,113 @@ class CommCPU : public Comm { } } + void BroadcastRowSparse(int key, const NDArray& src, + const std::vector>& dst, + const bool use_copy, + const int priority) override { + using namespace mshadow; + CHECK_EQ(src.storage_type(), kRowSparseStorage) + << "BroadcastRowSparse expects row-sparse src NDArray"; + CHECK_EQ(src.ctx().dev_mask(), Context::kCPU) + << "BroadcastRowSparse with src on gpu context not supported"; + for (size_t i = 0; i < dst.size(); ++i) { + NDArray* out = dst[i].first; + NDArray row_id = dst[i].second; + if (use_copy) { + CopyFromTo(src, out, priority); + } else { + CHECK_EQ(out->storage_type(), kRowSparseStorage) + << "BroadcastRowSparse expects row_sparse dst NDArray"; + CHECK_EQ(row_id.ctx().dev_mask(), Context::kCPU) + << "BroadcastRowSparse with row_indices on gpu context not supported"; + // retain according to unique indices + const bool use_sparse_retain = (src.shape()[0] != src.storage_shape()[0]) + || (row_id.dtype() != out->aux_type(rowsparse::kIdx)) + || (out->ctx().dev_mask() != Context::kGPU); + if (use_sparse_retain) { // use sparse_retain op + const bool is_to_gpu 
= out->ctx().dev_mask() == Context::kGPU; + NDArray out_cpu = is_to_gpu? NDArray(kRowSparseStorage, src.shape(), + src.ctx(), true, src.dtype(), src.aux_types()) : *out; + Engine::Get()->PushSync([=](RunContext rctx) { + const TBlob& indices = row_id.data(); + NDArray temp = out_cpu; // get rid of const qualifier + op::SparseRetainOpForwardRspImpl(rctx.get_stream(), + src, indices, kWriteTo, + &temp); + }, Context::CPU(), {src.var(), row_id.var()}, {out_cpu.var()}, + FnProperty::kNormal, priority, PROFILER_MESSAGE("KVStoreSparseRetain")); + if (is_to_gpu) { + CopyFromTo(out_cpu, out, priority); + } + } else { // direct copy rows + Engine::Get()->PushSync([=](RunContext rctx) { + CopyRetainedRowsToGPU(rctx.get_stream(), rctx.get_stream(), + src, row_id, out); + }, out->ctx(), {src.var(), row_id.var()}, {out->var()}, + FnProperty::kCopyToGPU, priority, PROFILER_MESSAGE("KVStoreCopyRetainedRowsToGPU")); + } + } + } + } + private: + /*! + * \brief When src is a rsp with full rows, + * simply copy retained rows directly from cpu to gpu + * without invoking sparse_retain op. 
+ */ + void CopyRetainedRowsToGPU(mshadow::Stream* cpu_stream, + mshadow::Stream* gpu_stream, + const NDArray& src, + const NDArray& indices, + NDArray* dst) { +#if MXNET_USE_CUDA == 1 + CHECK_EQ(src.storage_type(), kRowSparseStorage) + << "CopyRetainedRowsToGPU expects row-sparse src NDArray"; + CHECK_EQ(src.ctx().dev_mask(), Context::kCPU) + << "CopyRetainedRowsToGPU with src on gpu context not supported"; + CHECK_EQ(src.storage_shape()[0], src.shape()[0]) + << "CopyRetainedRowsToGPU only supports src rsp with full rows"; + CHECK_EQ(indices.storage_type(), kDefaultStorage); + CHECK_EQ(indices.ctx().dev_mask(), Context::kCPU); + CHECK_EQ(dst->storage_type(), kRowSparseStorage); + CHECK_EQ(dst->ctx().dev_mask(), Context::kGPU); + CHECK_EQ(indices.dtype(), dst->aux_type(rowsparse::kIdx)) + << "CopyRetainedRowsToGPU only supports same data type for idx array and dst aux_data(0)"; + if (!src.storage_initialized() || indices.data().Size() == 0U) { + op::FillZerosRspImpl(gpu_stream, dst); + return; + } + using namespace mshadow; + + const TBlob& src_data = src.data(); + const TBlob& idx_data = indices.data(); + const size_t row_length = src.shape().ProdShape(1, src.shape().ndim()); + const size_t num_rows_retained = idx_data.Size(); + dst->CheckAndAlloc({Shape1(num_rows_retained)}); + TBlob dst_data = dst->data(); + TBlob dst_idx_data = dst->aux_data(rowsparse::kIdx); + MSHADOW_TYPE_SWITCH(src.dtype(), DType, { + MSHADOW_IDX_TYPE_SWITCH(indices.dtype(), IType, { + // copy idx array + Tensor dst_idx_tensor = dst_idx_data.FlatTo1D(gpu_stream); + const Tensor idx_tensor = idx_data.FlatTo1D(cpu_stream); + Copy(dst_idx_tensor, idx_tensor, gpu_stream); + // copy src data + const Tensor src_data_tensor = src_data.get_with_shape( + Shape2(src_data.shape_[0], row_length), cpu_stream); + Tensor dst_data_tensor = dst_data.get_with_shape( + Shape2(dst_data.shape_[0], row_length), gpu_stream); + for (size_t i = 0; i < num_rows_retained; ++i) { + Copy(dst_data_tensor[i], 
src_data_tensor[idx_tensor[i]], gpu_stream); + } + }) + }) +#else + LOG(FATAL) << "GPU not enabled"; +#endif + } + // reduce sum into val[0] inline void ReduceSumCPU(const std::vector &in_data) { MSHADOW_TYPE_SWITCH(in_data[0].dtype(), DType, { @@ -144,6 +309,78 @@ class CommCPU : public Comm { }); } + // serial implementation of reduce sum for row sparse NDArray. + inline void ReduceSumCPUExSerial(const std::vector &in, NDArray *out) { + using namespace rowsparse; + using namespace mshadow; + auto stype = out->storage_type(); + CHECK_EQ(stype, kRowSparseStorage) << "Unexpected storage type " << stype; + size_t total_num_rows = 0; + size_t num_in = in.size(); + // skip the ones with empty indices and values + std::vector skip(num_in, false); + // the values tensor of the inputs + MSHADOW_TYPE_SWITCH(out->dtype(), DType, { + MSHADOW_IDX_TYPE_SWITCH(out->aux_type(kIdx), IType, { + std::vector> in_vals(num_in); + std::vector> in_indices(num_in); + // offset to the values tensor of all inputs + std::vector offsets(num_in, 0); + std::vector num_rows(num_in, 0); + for (size_t i = 0; i < num_in; i++) { + if (!in[i].storage_initialized()) { + skip[i] = true; + continue; + } + auto size = in[i].aux_shape(kIdx).Size(); + num_rows[i] = size; + total_num_rows += size; + in_vals[i] = in[i].data().FlatTo2D(); + in_indices[i] = in[i].aux_data(kIdx).FlatTo1D(); + } + std::vector indices; + indices.reserve(total_num_rows); + // gather indices from all inputs + for (size_t i = 0; i < num_in; i++) { + for (size_t j = 0; j < num_rows[i]; j++) { + indices.emplace_back(in_indices[i][j]); + } + } + CHECK_EQ(indices.size(), total_num_rows); + // dedup indices + std::sort(indices.begin(), indices.end()); + indices.resize(std::unique(indices.begin(), indices.end()) - indices.begin()); + // the one left are unique non-zero rows + size_t nnr = indices.size(); + // allocate memory for output + out->CheckAndAlloc({Shape1(nnr)}); + auto idx_data = out->aux_data(kIdx).FlatTo1D(); + auto val_data 
= out->data().FlatTo2D(); + + for (size_t i = 0; i < nnr; i++) { + // copy indices back + idx_data[i] = indices[i]; + bool zeros = true; + for (size_t j = 0; j < num_in; j++) { + if (skip[j]) continue; + size_t offset = offsets[j]; + if (offset < num_rows[j]) { + if (indices[i] == in_indices[j][offset]) { + if (zeros) { + Copy(val_data[i], in_vals[j][offset], nullptr); + zeros = false; + } else { + val_data[i] += in_vals[j][offset]; + } + offsets[j] += 1; + } + } + } + } + }); + }); + } + template inline static void ReduceSumCPU( const std::vector &dptr, size_t offset, index_t size) { @@ -209,6 +446,7 @@ class CommCPU : public Comm { std::unordered_map merge_buf_; size_t bigarray_bound_; int nthread_reduction_; + bool is_serial_push_; }; /** @@ -227,8 +465,13 @@ class CommDevice : public Comm { virtual ~CommDevice() { } - void Init(int key, const TShape& shape, int dtype = mshadow::kFloat32) override { - sorted_key_attrs_.push_back(std::make_tuple(key, shape, dtype)); + void Init(int key, const NDArrayStorageType stype, const TShape& shape, + int dtype = mshadow::kFloat32) override { + if (stype == kDefaultStorage) { + sorted_key_attrs_.push_back(std::make_tuple(key, shape, dtype)); + } else { + LOG(FATAL) << "storage type " << stype << " not implemented for device yet"; + } } const NDArray& Reduce(int key, const std::vector& src, @@ -296,6 +539,13 @@ class CommDevice : public Comm { } } + void BroadcastRowSparse(int key, const NDArray& src, + const std::vector>& dst, + const bool use_copy, + const int priority) override { + LOG(FATAL) << "Not implemented yet"; + } + private: void EnableP2P(const std::vector& devs) { #if MXNET_USE_CUDA diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index b64d7c6369bc..399754f5406d 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -25,6 +25,8 @@ #define MXNET_KVSTORE_KVSTORE_DIST_H_ #include #include +#include +#include #include "./kvstore_local.h" #include "mxnet/engine.h" #include 
"ps/ps.h" @@ -60,6 +62,7 @@ class KVStoreDist : public KVStoreLocal { } } bigarray_bound_ = dmlc::GetEnv("MXNET_KVSTORE_BIGARRAY_BOUND", 1000 * 1000); + log_verbose_ = dmlc::GetEnv("MXNET_KVSTORE_DIST_ROW_SPARSE_VERBOSE", false); } virtual ~KVStoreDist() { @@ -81,7 +84,7 @@ class KVStoreDist : public KVStoreLocal { const std::vector& values) override { CheckUnique(keys); for (size_t i = 0; i < keys.size(); ++i) { - comm_->Init(keys[i], values[i].shape(), values[i].dtype()); + comm_->Init(keys[i], values[i].storage_type(), values[i].shape(), values[i].dtype()); } if (get_rank() == 0) { Push_(keys, values, 0, false); @@ -108,17 +111,20 @@ class KVStoreDist : public KVStoreLocal { int priority) override { std::vector uniq_keys; std::vector > grouped_vals; - GroupKVPairs(keys, values, &uniq_keys, &grouped_vals); + GroupKVPairsPull(keys, values, &uniq_keys, &grouped_vals); for (size_t i = 0; i < uniq_keys.size(); ++i) { int key = uniq_keys[i]; // use the same array for merging to guarantee that pull always happens // after the previous push on this key auto& recv_buf = comm_buf_[key]; + const auto storage_type = grouped_vals[i][0]->storage_type(); + CHECK_EQ(storage_type, kDefaultStorage) + << "Expected stype of value to be kDefaultStorage"; if (recv_buf.is_none()) { // it may happen for the first time a no-rank-0 worker pull the weight. 
- recv_buf = NDArray( - grouped_vals[i][0]->shape(), pinned_ctx_, true, grouped_vals[i][0]->dtype()); + recv_buf = NDArray(grouped_vals[i][0]->shape(), pinned_ctx_, + true, grouped_vals[i][0]->dtype()); } auto pull_from_servers = [this, key, recv_buf]( RunContext rctx, Engine::CallbackOnComplete cb) { @@ -133,7 +139,7 @@ class KVStoreDist : public KVStoreLocal { auto vals = new ps::SArray(data, size, false); // issue pull CHECK_NOTNULL(ps_worker_)->ZPull( - pskv.keys, vals, &pskv.lens, 0, [vals, cb](){ delete vals; cb(); }); + pskv.keys, vals, &pskv.lens, kDefaultPushPull, [vals, cb](){ delete vals; cb(); }); }; CHECK_NOTNULL(Engine::Get())->PushAsync( @@ -143,12 +149,55 @@ class KVStoreDist : public KVStoreLocal { {recv_buf.var()}, FnProperty::kNormal, priority, - PROFILER_MESSAGE("KVStoreDistPull")); + PROFILER_MESSAGE("KVStoreDistDefaultPull")); comm_->Broadcast(key, recv_buf, grouped_vals[i], priority); } } + void PullRowSparse(const std::vector& keys, + const std::vector>& val_rowids, + const int priority = 0) { + std::vector uniq_keys; + std::vector>> grouped_val_rowids; + GroupKVPairsPullRsp(keys, val_rowids, &uniq_keys, &grouped_val_rowids); + + for (size_t i = 0; i < uniq_keys.size(); ++i) { + int key = uniq_keys[i]; + // use the same array for merging to guarantee that pull always happens + // after the previous push on this key + auto& recv_buf = comm_buf_[key]; + auto& grouped_val_rowid = grouped_val_rowids[i]; + const auto storage_type = grouped_val_rowid[0].first->storage_type(); + CHECK_EQ(storage_type, kRowSparseStorage) + << "expected kRowSparseStorage, but got " << storage_type; + if (recv_buf.is_none()) { + // it may happen for the first time a no-rank-0 worker pull the weight. 
+ recv_buf = NDArray(storage_type, grouped_val_rowid[0].first->shape(), + pinned_ctx_, true, grouped_val_rowid[0].first->dtype()); + } + auto &target_val_rowids = grouped_val_rowids[i]; + const size_t num_vals = target_val_rowids.size(); + size_t num_rows = 0; + // TODO(haibin) refactor this for loop + for (size_t i = 0; i < num_vals; i++) { + auto &row_id = target_val_rowids[i].second; + NDArray indices = row_id.Copy(pinned_ctx_); + Unique(&indices, priority); + target_val_rowids[i].second = indices; + num_rows += indices.shape().Size(); + } + if (num_vals > 1) { + // TODO(haibin) aggregate over all unique indices + LOG(FATAL) << "RowSparsePull with multiple values is not implemented yet"; + } else { + auto& indices = target_val_rowids[0].second; + PullRowSparse_(key, &recv_buf, indices, priority); + comm_->BroadcastRowSparse(key, recv_buf, grouped_val_rowid, num_vals == 1, priority); + } + } + } + void set_updater(const Updater& updater) override { CHECK(updater) << "invalid updater"; if (IsServerNode()) { @@ -212,7 +261,7 @@ class KVStoreDist : public KVStoreLocal { // first aggregate the values over keys std::vector uniq_keys; std::vector > grouped_vals; - GroupKVPairs(keys, values, &uniq_keys, &grouped_vals); + GroupKVPairsPush(keys, values, &uniq_keys, &grouped_vals); for (size_t i = 0; i < uniq_keys.size(); ++i) { // merge over devcies @@ -221,42 +270,132 @@ class KVStoreDist : public KVStoreLocal { NDArray merged = do_merge ? 
comm_->Reduce(key, vals, priority) : vals[0]; auto& send_buf = comm_buf_[key]; + const auto storage_type = merged.storage_type(); if (merged.ctx().dev_mask() == cpu::kDevMask) { + // make sure the previous push/pull is completed + send_buf.WaitToWrite(); send_buf = merged; // avoid memory copy } else { if (send_buf.is_none()) { - send_buf = NDArray(merged.shape(), pinned_ctx_, true, merged.dtype()); + if (storage_type == kDefaultStorage) { + send_buf = NDArray(merged.shape(), pinned_ctx_, true, merged.dtype()); + } else { + send_buf = NDArray(storage_type, merged.shape(), pinned_ctx_, true, merged.dtype()); + } } CopyFromTo(merged, &send_buf); } // push to servers + if (storage_type == kDefaultStorage) { auto push_to_servers = [this, key, send_buf](RunContext rctx, Engine::CallbackOnComplete cb) { - // convert to ps keys - size_t size = send_buf.shape().Size(); - PSKV& pskv = EncodeKey(key, size); + // convert to ps keys + size_t size = send_buf.shape().Size(); + PSKV& pskv = EncodeKey(key, size); #if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(send_buf.data()); + mkl_set_tblob_eager_mode(send_buf.data()); #endif - real_t* data = static_cast(send_buf.data().dptr_); - // do push. false means no delete - ps::SArray vals(data, size, false); - CHECK_NOTNULL(ps_worker_)->ZPush( - pskv.keys, vals, pskv.lens, 0, [cb]() { cb(); }); - }; - Engine::Get()->PushAsync( - push_to_servers, - pinned_ctx_, - {send_buf.var()}, - {}, - FnProperty::kNormal, - priority, - PROFILER_MESSAGE("KVStoreDistPush")); + real_t* data = static_cast(send_buf.data().dptr_); + // do push. 
false means no delete + ps::SArray vals(data, size, false); + CHECK_NOTNULL(ps_worker_)->ZPush( + pskv.keys, vals, pskv.lens, 0, [cb]() { cb(); }); + }; + Engine::Get()->PushAsync( + push_to_servers, + pinned_ctx_, + {send_buf.var()}, + {}, + FnProperty::kNormal, + priority, + PROFILER_MESSAGE("KVStoreDistDefaultPush")); + } else if (storage_type == kRowSparseStorage) { + PushRowSparse(key, send_buf, priority); + } else { + LOG(FATAL) << "unknown storage type"; + } } } + // pull row sparse weight into `recv_buf` based on indices given by `indices` + void PullRowSparse_(int key, NDArray *recv_buf, const NDArray& indices, int priority) { + using namespace rowsparse; + auto pull_from_servers = [this, key, recv_buf, indices] + (RunContext rctx, Engine::CallbackOnComplete cb) { + // allocate memory for the buffer + size_t num_rows = indices.shape().Size(); + recv_buf->CheckAndAlloc({mshadow::Shape1(num_rows)}); +#if MKL_EXPERIMENTAL == 1 + mkl_set_tblob_eager_mode(recv_buf->data()); +#endif + real_t* data = static_cast(recv_buf->data().dptr_); + auto indices_data = indices.data(); + const auto offsets = indices_data.dptr(); + const auto unit_len = recv_buf->shape().ProdShape(1, recv_buf->shape().ndim()); + const int64_t size = num_rows * unit_len; + // convert to ps keys in row sparse format + PSKV& pskv = EncodeRowSparseKey(key, size, num_rows, offsets, + unit_len, recv_buf->shape()[0]); + if (this->log_verbose_) { + LOG(INFO) << "worker " << get_rank() << " pull lens: " << pskv.lens << " keys: " + << pskv.keys << " size: " << size; + } + auto vals = new ps::SArray(data, size, false); + CHECK_NOTNULL(ps_worker_)->ZPull(pskv.keys, vals, &pskv.lens, kRowSparsePushPull, + [vals, cb]() { delete vals; cb(); }); + // copy indices to recv_buf + mshadow::Copy(recv_buf->aux_data(kIdx).FlatTo1D(), + indices_data.FlatTo1D()); + }; + CHECK_NOTNULL(Engine::Get())->PushAsync( + pull_from_servers, + pinned_ctx_, + {indices.var()}, + {recv_buf->var()}, + FnProperty::kNormal, + 
priority, + PROFILER_MESSAGE("KVStoreDistRowSparsePull")); + } + + // push row sparse gradient + void PushRowSparse(int key, const NDArray &send_buf, int priority) { + using namespace rowsparse; + auto push_to_servers = [this, key, &send_buf] + (RunContext rctx, Engine::CallbackOnComplete cb) { +#if MKL_EXPERIMENTAL == 1 + mkl_set_tblob_eager_mode(send_buf.data()); +#endif + real_t* data = static_cast(send_buf.data().dptr_); + bool init = send_buf.storage_initialized(); + const int64_t num_rows = init ? send_buf.aux_shape(kIdx)[0] : 0; + const auto offsets = init ? send_buf.aux_data(kIdx).dptr() : nullptr; + const auto unit_len = send_buf.shape().ProdShape(1, send_buf.shape().ndim()); + const int64_t size = num_rows * unit_len; + + // convert to ps keys in row sparse format + PSKV& pskv = EncodeRowSparseKey(key, size, num_rows, offsets, + unit_len, send_buf.shape()[0]); + if (this->log_verbose_) { + LOG(INFO) << "worker " << get_rank() << " push lens: " << pskv.lens << " keys: " + << pskv.keys << " size: " << size; + } + ps::SArray vals(data, size, false); + CHECK_NOTNULL(ps_worker_)->ZPush(pskv.keys, vals, pskv.lens, kRowSparsePushPull, [cb]() { + cb(); + }); + }; + Engine::Get()->PushAsync( + push_to_servers, + pinned_ctx_, + {send_buf.var()}, + {}, + FnProperty::kNormal, + priority, + PROFILER_MESSAGE("KVStoreDistRowSparsePush")); + } + /** * \brief check if the keys are all unique */ @@ -282,7 +421,7 @@ class KVStoreDist : public KVStoreLocal { std::unordered_map ps_kv_; /** - * \brief serizelize EncodeKey + * \brief serizelize EncodeRowSparseKey and EncodeKey */ std::mutex mu_; @@ -329,6 +468,64 @@ class KVStoreDist : public KVStoreLocal { return pskv; } + // TODO(haibin) this encoding method for row sparse keys doesn't allow cross-layer batching + inline PSKV& EncodeRowSparseKey(const int key, const int64_t size, const int64_t num_rows, + const int64_t *offsets, const size_t unit_len, + const int64_t total_num_rows) { + using namespace common; + mu_.lock(); + 
PSKV& pskv = ps_kv_[key]; + mu_.unlock(); + pskv.keys.clear(); + pskv.lens.clear(); + // TODO(haibin) cache this information + auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); + int num_servers = krs.size(); + CHECK_GT(num_servers, 0); + + if (total_num_rows * unit_len >= bigarray_bound_) { + pskv.size = 0; + int64_t start_row = 0; + // parition it to all servers + for (int i = 0; i < num_servers; ++i) { + // calculate partition ranges + int64_t part_num_rows = + llround(static_cast(total_num_rows) / num_servers * (i + 1)) - + llround(static_cast(total_num_rows) / num_servers * i); + auto end_row = start_row + part_num_rows; + auto lb = std::lower_bound(offsets, offsets + num_rows, start_row); + auto ub = std::upper_bound(offsets, offsets + num_rows, end_row - 1); + ps::Key master_key = krs[i].begin() + key; + pskv.keys.push_back(master_key); + pskv.lens.push_back(0); + for (auto offset = lb; offset < ub; offset++) { + ps::Key ps_key = krs[i].begin() + key + (*offset - start_row); + CHECK_LT(ps_key, krs[i].end()); + pskv.keys.push_back(ps_key); + pskv.lens.push_back(unit_len); + pskv.size += unit_len; + } + start_row = end_row; + } + CHECK_EQ(static_cast(pskv.size), size); + } else { + // send it to a single random picked server + int server = (key * 9973) % num_servers; + ps::Key master_key = krs[server].begin() + key; + pskv.keys.push_back(master_key); + pskv.lens.push_back(0); + for (int64_t i = 0; i < num_rows; i++) { + ps::Key ps_key = krs[server].begin() + key + offsets[i]; + CHECK_LT(ps_key, krs[server].end()); + pskv.keys.push_back(ps_key); + pskv.lens.push_back(unit_len); + } + pskv.size = size; + } + return pskv; + } + + /** * \brief for worker to push and pull data */ @@ -343,6 +540,7 @@ class KVStoreDist : public KVStoreLocal { size_t bigarray_bound_; /// \brief send & recver buffer std::unordered_map comm_buf_; + bool log_verbose_; }; } // namespace kvstore diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 
4e9f887173c5..43a10b034ca6 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -33,10 +33,14 @@ #include #include "ps/ps.h" #include "mxnet/kvstore.h" +#include "../operator/tensor/elemwise_binary_op.h" +#include "../operator/tensor/init_op.h" namespace mxnet { namespace kvstore { +static const int kRowSparsePushPull = 1; +static const int kDefaultPushPull = 0; static const int kStopServer = -1; static const int kSyncMode = -2; @@ -110,8 +114,9 @@ class KVStoreDistServer { static_cast(ps_server_)->set_request_handle( std::bind(&KVStoreDistServer::CommandHandle, this, _1, _2)); ps_server_->set_request_handle( - std::bind(&KVStoreDistServer::DataHandle, this, _1, _2, _3)); + std::bind(&KVStoreDistServer::DataHandleEx, this, _1, _2, _3)); sync_mode_ = false; + log_verbose_ = dmlc::GetEnv("MXNET_KVSTORE_DIST_ROW_SPARSE_VERBOSE", false); } ~KVStoreDistServer() { @@ -136,6 +141,11 @@ class KVStoreDistServer { } private: + struct MergeBuf { + std::vector request; + NDArray array; + }; + void CommandHandle(const ps::SimpleData& recved, ps::SimpleApp* app) { if (recved.head == kStopServer) { exec_.Stop(); @@ -151,9 +161,205 @@ class KVStoreDistServer { app->Response(recved); } - void DataHandle(const ps::KVMeta& req_meta, - const ps::KVPairs& req_data, - ps::KVServer* server) { + void DataHandleEx(const ps::KVMeta& req_meta, + const ps::KVPairs& req_data, + ps::KVServer* server) { + if (req_meta.cmd == kRowSparsePushPull) { + DataHandleRowSparse(req_meta, req_data, server); + } else { + DataHandleDefault(req_meta, req_data, server); + } + return; + } + + inline void ApplyUpdates(const int key, MergeBuf *merged, NDArray *stored, + ps::KVServer* server) { + if (merged->request.size() == (size_t) ps::NumWorkers()) { + // let the main thread to execute updater_, which is necessary for python + if (updater_) { + exec_.Exec([this, key, merged, stored](){ + CHECK(updater_); + updater_(key, merged->array, stored); + }); + } else { + // if no 
updater, just copy + CopyFromTo(merged->array, stored); + } + if (log_verbose_) { + LOG(INFO) << "sync response to " << merged->request.size() << " workers"; + } + for (const auto& req : merged->request) { + server->Response(req); + } + merged->request.clear(); + stored->WaitToRead(); + } else { + merged->array.WaitToRead(); + } + } + + void DecodeRowIds(const ps::SArray &keys, int64_t *indices, + const int64_t master_key, const int64_t num_rows) { + indices[0] = 0; + for (int64_t i = 1; i <= num_rows; i++) { + int key = DecodeKey(keys[i]); + auto row_id = key - master_key; + indices[i - 1] = row_id; + } + } + + void DataHandleRowSparse(const ps::KVMeta& req_meta, + const ps::KVPairs& req_data, + ps::KVServer* server) { + int master_key = DecodeKey(req_data.keys[0]); + auto num_rows = req_data.keys.size() - 1; + auto& stored = store_[master_key]; + if (req_meta.push) { + CHECK_GT(req_data.lens.size(), 0) << "req_data.lens cannot be empty"; + CHECK_EQ(req_data.lens[0], 0); + real_t* data = req_data.vals.data(); + if (stored.is_none()) { + if (log_verbose_) LOG(INFO) << "initial push: " << master_key; + // initialization + CHECK_GT(num_rows, 0) << "init with empty data is not supported"; + auto unit_len = req_data.lens[1]; + CHECK_GT(unit_len, 0); + size_t ds[] = {num_rows, (size_t) unit_len}; + TShape dshape(ds, ds + 2); + CHECK_EQ(req_data.vals.size(), num_rows * unit_len); + TBlob recv_blob(data, dshape, cpu::kDevMask); // NOLINT(*) + NDArray recved = NDArray(recv_blob, 0); + stored = NDArray(kRowSparseStorage, dshape, Context()); + Engine::Get()->PushSync([recved, stored](RunContext ctx) { + NDArray rsp = stored; + stored.CheckAndAlloc({mshadow::Shape1(recved.shape()[0])}); + mshadow::Stream *s = ctx.get_stream(); + op::PopulateFullIdxRspImpl(s, &rsp); + mshadow::Copy(rsp.data().FlatTo1D(), + recved.data().FlatTo1D(), s); + }, recved.ctx(), {recved.var()}, {stored.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); + stored.WaitToRead(); + 
server->Response(req_meta); + return; + } + // synced push + if (sync_mode_) { + if (log_verbose_) LOG(INFO) << "sync push: " << master_key << " " << req_data.keys; + auto& merged = merge_buf_[master_key]; + if (merged.array.is_none()) { + merged.array = NDArray(kRowSparseStorage, stored.shape(), Context()); + } + if (num_rows == 0) { + // reset to zeros + if (merged.request.size() == 0) { + merged.array = NDArray(kRowSparseStorage, stored.shape(), Context()); + } else { + // nothing to aggregate + } + merged.request.push_back(req_meta); + ApplyUpdates(master_key, &merged, &stored, server); + return; + } + auto unit_len = req_data.lens[1]; + CHECK_GT(unit_len, 0); + // indices + std::vector indices(num_rows); + DecodeRowIds(req_data.keys, indices.data(), master_key, num_rows); + // data + TBlob idx_blob(indices.data(), mshadow::Shape1(num_rows), cpu::kDevMask); + size_t ds[] = {(size_t) num_rows, (size_t) unit_len}; + TShape dshape(ds, ds + 2); + TBlob recv_blob(data, dshape, cpu::kDevMask); // NOLINT(*) + // row_sparse NDArray + NDArray recved(kRowSparseStorage, stored.shape(), recv_blob, {idx_blob}, 0); + + if (merged.request.size() == 0) { + CopyFromTo(recved, &merged.array, 0); + } else { + NDArray out(kRowSparseStorage, stored.shape(), Context()); + std::vector const_vars; + const_vars.push_back(recved.var()); + const_vars.push_back(merged.array.var()); + // accumulate row_sparse gradients + // TODO(haibin) override + operator for row_sparse NDArray + // instead of calling BinaryComputeRspRsp directly + using namespace mshadow; + Engine::Get()->PushSync([recved, merged, out](RunContext ctx) { + std::vector inputs, outputs; + inputs.push_back(recved); + inputs.push_back(merged.array); + outputs.push_back(out); + op::BinaryComputeRspRspImpl({}, {}, inputs, {kWriteTo}, outputs); + }, recved.ctx(), const_vars, {out.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); + CopyFromTo(out, &merged.array, 0); + } + merged.request.push_back(req_meta); + 
ApplyUpdates(master_key, &merged, &stored, server); + } else { + // async push + if (log_verbose_) LOG(INFO) << "async push: " << master_key; + if (num_rows == 0) { + server->Response(req_meta); + return; + } + auto unit_len = req_data.lens[1]; + CHECK_GT(unit_len, 0); + // indices + std::vector indices(num_rows); + DecodeRowIds(req_data.keys, indices.data(), master_key, num_rows); + TBlob idx_blob(indices.data(), mshadow::Shape1(num_rows), cpu::kDevMask); + size_t ds[] = {(size_t) num_rows, (size_t) unit_len}; + TShape dshape(ds, ds + 2); + TBlob recv_blob(data, dshape, cpu::kDevMask); // NOLINT(*) + NDArray recved(kRowSparseStorage, stored.shape(), recv_blob, {idx_blob}, 0); + exec_.Exec([this, master_key, &recved, &stored](){ + CHECK(updater_); + updater_(master_key, recved, &stored); + }); + server->Response(req_meta); + stored.WaitToRead(); + } + } else { + // pull + if (log_verbose_) LOG(INFO) << "pull: " << master_key; + ps::KVPairs response; + if (num_rows == 0) { + std::vector lens(req_data.keys.size(), 0); + response.keys = req_data.keys; + response.lens.CopyFrom(lens.begin(), lens.end()); + server->Response(req_meta, response); + return; + } + CHECK(!stored.is_none()) << "init " << master_key << " first"; + auto shape = stored.shape(); + auto unit_len = shape.ProdShape(1, shape.ndim()); + const float* data = stored.data().dptr(); + auto len = unit_len * num_rows; + // concat values + response.vals.resize(len); + for (size_t i = 1; i <= num_rows; i++) { + int key = DecodeKey(req_data.keys[i]); + int64_t row_id = key - master_key; + const auto src = data + row_id * unit_len; + auto begin = (i - 1) * unit_len; + auto end = i * unit_len; + response.vals.segment(begin, end).CopyFrom(src, unit_len); + } + // setup response + response.keys = req_data.keys; + std::vector lens(req_data.keys.size(), unit_len); + lens[0] = 0; + response.lens.CopyFrom(lens.begin(), lens.end()); + server->Response(req_meta, response); + } + } + + void DataHandleDefault(const 
ps::KVMeta& req_meta, + const ps::KVPairs &req_data, + ps::KVServer* server) { + CHECK_EQ(req_meta.cmd, kDefaultPushPull); // do some check CHECK_EQ(req_data.keys.size(), (size_t)1); if (req_meta.push) { @@ -185,35 +391,13 @@ class KVStoreDistServer { if (merged.array.is_none()) { merged.array = NDArray(dshape, Context()); } - if (merged.request.size() == 0) { CopyFromTo(recved, &merged.array, 0); } else { merged.array += recved; } - merged.request.push_back(req_meta); - - if (merged.request.size() == (size_t)ps::NumWorkers()) { - // let the main thread to execute updater_, which is necessary for - // python - if (updater_) { - exec_.Exec([this, key, &merged, &stored](){ - CHECK(updater_); - updater_(key, merged.array, &stored); - }); - } else { - // if no updater, just copy - CopyFromTo(merged.array, &stored); - } - for (const auto& req : merged.request) { - server->Response(req); - } - merged.request.clear(); - stored.WaitToRead(); - } else { - merged.array.WaitToRead(); - } + ApplyUpdates(key, &merged, &stored, server); } else { // async push exec_.Exec([this, key, &recved, &stored](){ @@ -227,7 +411,7 @@ class KVStoreDistServer { // pull ps::KVPairs response; CHECK(!stored.is_none()) << "init " << key << " first"; - int len = stored.shape()[0]; + auto len = stored.shape().Size(); response.keys = req_data.keys; response.lens = {len}; // TODO(mli) try to remove this CopyFrom @@ -249,16 +433,13 @@ class KVStoreDistServer { KVStore::Updater updater_; std::unordered_map store_; - - struct MergeBuf { - std::vector request; - NDArray array; - }; std::unordered_map merge_buf_; Executor exec_; - ps::KVServer* ps_server_; + + // whether to LOG verbose information + bool log_verbose_; }; } // namespace kvstore diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 536a89b46e13..11d4b644346e 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "./comm.h" @@ 
-62,7 +63,7 @@ class KVStoreLocal : public KVStore { CHECK(local_.find(keys[i]) == local_.end()) << "duplicate init of key " << keys[i]; local_[keys[i]] = values[i].Copy(pinned_ctx_); - comm_->Init(keys[i], values[i].shape(), values[i].dtype()); + comm_->Init(keys[i], values[i].storage_type(), values[i].shape(), values[i].dtype()); } } @@ -85,7 +86,7 @@ class KVStoreLocal : public KVStore { int priority) override { std::vector uniq_keys; std::vector > grouped_vals; - GroupKVPairs(keys, values, &uniq_keys, &grouped_vals); + GroupKVPairsPush(keys, values, &uniq_keys, &grouped_vals); for (size_t i = 0; i < uniq_keys.size(); ++i) { int key = uniq_keys[i]; @@ -100,7 +101,11 @@ class KVStoreLocal : public KVStore { } updater_(key, merged, &local); } else { - local = merged; + if (merged.storage_type() != local.storage_type()) { + local = merged.Copy(local.ctx()); + } else { + local = merged; + } } } } @@ -110,7 +115,7 @@ class KVStoreLocal : public KVStore { int priority) override { std::vector uniq_keys; std::vector > grouped_vals; - GroupKVPairs(keys, values, &uniq_keys, &grouped_vals); + GroupKVPairsPull(keys, values, &uniq_keys, &grouped_vals); for (size_t i = 0; i < uniq_keys.size(); ++i) { int key = uniq_keys[i]; @@ -120,6 +125,30 @@ class KVStoreLocal : public KVStore { } } + void PullRowSparse(const std::vector& keys, + const std::vector>& val_rowids, + int priority = 0) override { + std::vector uniq_keys; + std::vector>> grouped_val_rowids; + GroupKVPairsPullRsp(keys, val_rowids, &uniq_keys, &grouped_val_rowids); + for (size_t i = 0; i < uniq_keys.size(); ++i) { + int key = uniq_keys[i]; + const NDArray& local = local_[key]; + CHECK(!local.is_none()) << "key " << key << " has not been inited"; + CHECK_EQ(local.storage_type(), kRowSparseStorage) + << "PullRowSparse expects row_sparse src NDArray"; + auto &target_val_rowids = grouped_val_rowids[i]; + const size_t num_vals = target_val_rowids.size(); + for (size_t i = 0; i < num_vals; i++) { + auto &row_id = 
target_val_rowids[i].second; + NDArray indices = row_id.Copy(pinned_ctx_); + Unique(&indices, priority); + target_val_rowids[i].second = indices; + } + comm_->BroadcastRowSparse(key, local, grouped_val_rowids[i], false, priority); + } + } + void Push(const std::vector& str_keys, const std::vector& values, int priority) override { @@ -136,15 +165,85 @@ class KVStoreLocal : public KVStore { Pull(keys, values, priority); } + void PullRowSparse(const std::vector& str_keys, + const std::vector>& val_rowids, + const int priority = 0) override { + std::vector keys(str_keys.size()); + LookupKeys(str_keys, &keys); + PullRowSparse(keys, val_rowids, priority); + } + protected: /** - * \brief group values on keys + * \brief group values on keys for push */ - template + void GroupKVPairsPush(const std::vector& keys, + const std::vector& values, + std::vector *uniq_keys, + std::vector> *grouped_vals) { + // check if the storage type of a value is valid + auto validator = [this](const int key, const NDArray& nd) -> bool { + auto stype = nd.storage_type(); + // valid NDArray + if (stype == kDefaultStorage || stype == kRowSparseStorage) return true; + // invalid NDArray, abort + LOG(FATAL) << "Unexpected storage type detected during kvstore push: " << stype; + return false; + }; + GroupKVPairs(keys, values, uniq_keys, grouped_vals, validator); + } + /** + * \brief group values on keys for pull + */ + void GroupKVPairsPull(const std::vector& keys, + const std::vector& values, + std::vector *uniq_keys, + std::vector> *grouped_vals) { + // check if the storage type of a value is valid + auto validator = [this](const int key, const NDArray* nd) -> bool { + // valid + if (nd->storage_type() == kDefaultStorage) return true; + // invalid, print warning messages once + if (this->warnings_printed_.find(key) == this->warnings_printed_.end()) { + LOG(INFO) << "Warning: non-default weights detected during kvstore pull. 
" + << "Please make sure to use row_sparse_pull with row_ids instead."; + this->warnings_printed_.insert(key); + } + return false; + }; + GroupKVPairs(keys, values, uniq_keys, grouped_vals, validator); + } + /** + * \brief group values on keys for row_sparse_pull + */ + void GroupKVPairsPullRsp(const std::vector& keys, + const std::vector>& values, + std::vector *uniq_keys, + std::vector>> *grouped_vals) { + // check if the storage type of a value is valid + auto validator = [this](const int key, const std::pair& val_rowid) -> bool { + auto val_stype = val_rowid.first->storage_type(); + auto rowid_stype = val_rowid.second.storage_type(); + // check storage types + CHECK_EQ(val_stype, kRowSparseStorage) << "Expected row_sparse storage type for " + << "row_sparse_pull values, but detected storage type " << val_stype; + CHECK_EQ(rowid_stype, kDefaultStorage) << "Expected default storage type for " + << "row_sparse_pull rowids, but detected storage type " << rowid_stype; + return true; + }; + GroupKVPairs(keys, values, uniq_keys, grouped_vals, validator); + } + + /** + * \brief group values on keys with validation. + * A value `v` is not included in the result if is_valid(v) returns false. 
+ */ + template void GroupKVPairs(const std::vector& keys, const std::vector& values, std::vector* uniq_keys, - std::vector >* grouped_vals) { + std::vector >* grouped_vals, + const FValidate& is_valid) { CHECK_EQ(keys.size(), values.size()); // TODO(mli) check if already sorted as an optimization using Idx = std::pair; @@ -158,12 +257,14 @@ class KVStoreLocal : public KVStore { int pre_key = idx[0].first - 1; for (auto i : idx) { - if (i.first != pre_key) { - uniq_keys->push_back(i.first); - grouped_vals->push_back({values[i.second]}); - pre_key = i.first;; - } else { - grouped_vals->back().push_back(values[i.second]); + if (is_valid(i.first, values[i.second])) { + if (i.first != pre_key) { + uniq_keys->push_back(i.first); + grouped_vals->push_back({values[i.second]}); + pre_key = i.first; + } else { + grouped_vals->back().push_back(values[i.second]); + } } } } @@ -178,6 +279,28 @@ class KVStoreLocal : public KVStore { } } + /** + * \brief sort and get unique values. Output is expected to be on cpu_pinned context + */ + void Unique(NDArray *out, int priority = 0) { + CHECK_EQ(out->ctx().dev_mask(), pinned_ctx_.dev_mask()) + << "Unique expects input with `pinned_ctx_`"; + Engine::Get()->PushSync([out](RunContext rctx) { + NDArray *output = out; + CHECK_EQ(out->shape().ndim(), 1) << "Unique expects 1D inputs"; + const auto size = out->shape()[0]; + auto out_data = output->data(); + MSHADOW_IDX_TYPE_SWITCH(out_data.type_flag_, IType, { + auto dptr = output->data().dptr(); + common::ParallelSort(dptr, dptr + size, omp_get_max_threads()); + auto num_unique_idx = std::unique(dptr, dptr + size) - dptr; + *output = output->Reshape(mshadow::Shape1(num_unique_idx)); + }); + }, pinned_ctx_, {}, {out->var()}, + FnProperty::kCPUPrioritized, priority, PROFILER_MESSAGE("KVStoreUnique")); + out->WaitToRead(); + } + /// reducer and broadcaster Comm* comm_; /// pinned context @@ -188,6 +311,8 @@ class KVStoreLocal : public KVStore { std::unordered_map str_key_dict_; /// the next 
available integer for string->int key mapping int next_str_key_ = 0; + /// whether printed warning due to mismatch stype in each key + std::unordered_set warnings_printed_; }; } // namespace kvstore } // namespace mxnet diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 8e71df729b73..0d2968626d79 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -30,6 +30,9 @@ #include #include #include "./ndarray_function.h" +#include "../common/utils.h" +#include "../operator/tensor/matrix_op-inl.h" +#include "../operator/tensor/init_op.h" #include "./autograd.h" #if MXNET_USE_OPENCV @@ -52,6 +55,8 @@ NDArray NDArray::grad() const { NDArray NDArray::Reshape(const TShape &shape) const { using namespace autograd; + CHECK(storage_type() == kDefaultStorage) << "Reshape for storage type " << + storage_type() << " is not implemented yet"; if (AutogradRuntime::Get()->IsTraining()) { CHECK_GE(shape_.Size(), shape.Size()) << "NDArray.Reshape: target shape must have must have the same size as " @@ -82,13 +87,15 @@ NDArray NDArray::Reshape(const TShape &shape) const { } } - NDArray NDArray::Slice(index_t begin, index_t end) const { using namespace autograd; - NDArray ret = *this; + using namespace mshadow; CHECK(!is_none()) << "NDArray is not initialized"; CHECK_LT(begin, end) << "Invalid slicing range [" << begin << ", " << end << ")"; CHECK_GE(shape_[0], end) << "Slice end index out of range"; + CHECK_EQ(storage_type(), kDefaultStorage); + NDArray ret = *this; + auto stype = storage_type(); size_t length = shape_.ProdShape(1, shape_.ndim()); MSHADOW_TYPE_SWITCH(ret.dtype(), DType, { ret.byte_offset_ += begin * length * sizeof(DType); @@ -115,8 +122,9 @@ NDArray NDArray::Slice(index_t begin, index_t end) const { } } - NDArray NDArray::At(index_t idx) const { + CHECK(storage_type() == kDefaultStorage) << "Storage type " + << storage_type() << " doesn't support At()"; NDArray ret = this->Slice(idx, idx+1); if (shape_.ndim() > 1) { return 
ret.Reshape(TShape(shape_.data()+1, shape_.data()+shape_.ndim())); @@ -125,6 +133,24 @@ NDArray NDArray::At(index_t idx) const { } } +/*! + * \brief Return deep copy of the current ndarry's aux_data(i) + * as an NDArray of default storage type. This function blocks. + */ +NDArray NDArray::aux_ndarray(size_t i) const { + CHECK_NE(storage_type(), kDefaultStorage); + CHECK(i < ptr_->aux_shapes.size()); + // create a delay_alloc default ndarray as output + NDArray ret(TShape(), ctx(), true, aux_type(i)); + ret.SyncCopyFromNDArray(*this, i); + return ret; +} + +NDArray NDArray::data_ndarray() const { + NDArray ret(TShape(), ctx(), true, dtype_); + ret.SyncCopyFromNDArray(*this); + return ret; +} bool NDArray::fresh_out_grad() const { if (entry_.ag_node != nullptr) return entry_.ag_node->fresh_out_grad; @@ -239,11 +265,11 @@ void BinaryOp(const NDArray &lhs, // redirect everything to mshadow operations switch (lhs.ctx().dev_mask()) { case cpu::kDevMask: { - Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) { - TBlob tmp = ret.data(); - ndarray::Eval(lhs.data(), rhs.data(), &tmp, ctx); - }, lhs.ctx(), const_vars, {ret.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); + Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) { + TBlob tmp = ret.data(); + ndarray::Eval(lhs.data(), rhs.data(), &tmp, ctx); + }, lhs.ctx(), const_vars, {ret.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); break; } #if MXNET_USE_CUDA @@ -269,6 +295,7 @@ void SetValueOp(const real_t &rhs, NDArray *out) { switch (ret.ctx().dev_mask()) { case cpu::kDevMask: { Engine::Get()->PushSync([rhs, ret](RunContext ctx) { + CHECK(ret.storage_type() == kDefaultStorage); TBlob tmp = ret.data(); ndarray::Eval(rhs, &tmp, ctx); }, ret.ctx(), {}, {ret.var()}, @@ -340,6 +367,134 @@ void ScalarOp(const NDArray &lhs, } } +size_t num_aux_data(NDArrayStorageType stype) { + size_t num = 0; + switch (stype) { + case kDefaultStorage: num = 0; break; + case kCSRStorage: num = 2; break; + case 
kRowSparseStorage: num = 1; break; + default: LOG(FATAL) << "Unknown storage type" << stype; break; + } + return num; +} + +// Make a copy of a CSR NDArray +template +inline void CopyFromToCsrImpl(const NDArray from, NDArray *to, RunContext ctx) { + using namespace mshadow; + CHECK_EQ(from.storage_type(), to->storage_type()) << "Copying with different storage type"; + // if source storage is not initialized, fill destination with zeros + auto s = ctx.get_stream(); + if (!from.storage_initialized()) { + op::FillZerosCsrImpl(s, to); + return; + } + // Allocate storage + to->CheckAndAllocAuxData(csr::kIndPtr, from.aux_shape(csr::kIndPtr)); + to->CheckAndAllocAuxData(csr::kIdx, from.aux_shape(csr::kIdx)); + to->CheckAndAllocData(from.aux_shape(csr::kIdx)); + TBlob val = to->data(); + TBlob indptr = to->aux_data(csr::kIndPtr); + TBlob idx = to->aux_data(csr::kIdx); + ndarray::Copy(from.data(), &val, + from.ctx(), to->ctx(), ctx); + ndarray::Copy(from.aux_data(csr::kIndPtr), &indptr, + from.ctx(), to->ctx(), ctx); + ndarray::Copy(from.aux_data(csr::kIdx), &idx, + from.ctx(), to->ctx(), ctx); +} + +// Make a copy of a row-sparse NDArray +template +inline void CopyFromToRspImpl(const NDArray from, NDArray *to, RunContext ctx) { + using namespace mshadow; + CHECK_EQ(from.storage_type(), to->storage_type()) << "Copying with different storage type"; + // if source is zeros, fill destination with zeros, too + auto s = ctx.get_stream(); + if (!from.storage_initialized()) { + op::FillZerosRspImpl(s, to); + return; + } + auto aux_shape = from.aux_shape(rowsparse::kIdx); + to->CheckAndAlloc({aux_shape}); + TBlob val = to->data(); + TBlob idx = to->aux_data(rowsparse::kIdx); + ndarray::Copy(from.data(), &val, + from.ctx(), to->ctx(), ctx); + ndarray::Copy(from.aux_data(rowsparse::kIdx), &idx, + from.ctx(), to->ctx(), ctx); +} + +// Make a copy of a dense NDArray +template +inline void CopyFromToDnsImpl(const NDArray from, NDArray *to, RunContext ctx) { + using namespace mshadow; + 
CHECK_EQ(from.storage_type(), to->storage_type()) << "Copying with different storage type"; + TBlob tmp = to->data(); + ndarray::Copy(from.data(), &tmp, + from.ctx(), to->ctx(), ctx); +} + +// Make a copy of an NDArray based on storage type +template +void CopyFromToImpl(const NDArray from, NDArray *to, RunContext rctx) { + using namespace std; + using namespace mshadow; + // if storage type doesn't match, cast the storage first + auto from_stype = from.storage_type(); + auto to_stype = to->storage_type(); + CHECK(from_stype == kDefaultStorage + || to_stype == kDefaultStorage + || from_stype == to_stype) + << "Copying ndarray of stype = " << from_stype + << " to stype = " << to_stype << " is not supported"; + const auto from_ctx = from.ctx(); + const auto to_ctx = to->ctx(); + auto s = rctx.get_stream(); + bool is_train = mxnet::autograd::AutogradRuntime::Get()->IsTraining(); + std::vector requested; + if (is_same::value && from_stype != to_stype) { + requested.push_back(ResourceManager::Get()->Request(from_ctx, + ResourceRequest(ResourceRequest::kTempSpace))); + } + OpContext opctx{is_train, + rctx, + engine::CallbackOnComplete(), + requested}; + if (from_ctx == to_ctx && from_stype != to_stype) { + // same ctx, different stypes, use cast op directly without copying + common::CastStorageDispatch(opctx, from, *to); + } else { + NDArray casted_nd; // an intermediate result before copying from to to + if (from_stype == to_stype) { + casted_nd = from; // same stype, no need to cast from + } else { // different stypes on different ctx needs an temporary casted_nd + TShape shape = from.shape(); + if (to_stype == kDefaultStorage) { + casted_nd = NDArray(shape, from_ctx); + } else { + casted_nd = NDArray(to_stype, shape, from_ctx); + } + // convert from_nd to the same stype as to_nd + common::CastStorageDispatch(opctx, from, casted_nd); + } + + if (to_stype == kDefaultStorage) { + CopyFromToDnsImpl(casted_nd, to, rctx); + } else if (to_stype == kRowSparseStorage) { + 
CopyFromToRspImpl(casted_nd, to, rctx); + } else if (to_stype == kCSRStorage) { + CopyFromToCsrImpl(casted_nd, to, rctx); + } else { + LOG(FATAL) << "unknown storage type" << to_stype; + } + } + if (is_same::value || is_same::value) { + // Wait GPU kernel to complete + rctx.get_stream()->Wait(); + } +} + void CopyFromTo(const NDArray &from, NDArray *to, int priority) { if (from.var() == to->var()) { // skip to copy to itself @@ -354,44 +509,33 @@ void CopyFromTo(const NDArray &from, NDArray *to, int priority) { NDArray ret = *to; int a = from.ctx().dev_mask(); int b = to->ctx().dev_mask(); - std::vector const_vars; if (from.var() != ret.var()) const_vars.push_back(from.var()); if (a == cpu::kDevMask && b == cpu::kDevMask) { Engine::Get()->PushSync([from, ret](RunContext ctx) { - TBlob tmp = ret.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), ret.ctx(), ctx); + NDArray nd(ret); + CopyFromToImpl(from, &nd, ctx); }, from.ctx(), const_vars, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("CopyCPU2CPU")); } else { #if MXNET_USE_CUDA if (a == cpu::kDevMask && b == gpu::kDevMask) { Engine::Get()->PushSync([from, ret](RunContext ctx) { - TBlob tmp = ret.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), ret.ctx(), ctx); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); + NDArray nd(ret); + CopyFromToImpl(from, &nd, ctx); }, ret.ctx(), const_vars, {ret.var()}, FnProperty::kCopyToGPU, priority, PROFILER_MESSAGE("CopyCPU2GPU")); } else if (a == gpu::kDevMask && b == cpu::kDevMask) { Engine::Get()->PushSync([from, ret](RunContext ctx) { - TBlob tmp = ret.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), ret.ctx(), ctx); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); + NDArray nd(ret); + CopyFromToImpl(from, &nd, ctx); }, from.ctx(), const_vars, {ret.var()}, FnProperty::kCopyFromGPU, priority, PROFILER_MESSAGE("CopyGPU2CPU")); } else if (a == gpu::kDevMask && b == gpu::kDevMask) { Engine::Get()->PushSync([from, 
ret](RunContext ctx) { - TBlob tmp = ret.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), ret.ctx(), ctx); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); + NDArray nd(ret); + CopyFromToImpl(from, &nd, ctx); }, from.ctx(), const_vars, {ret.var()}, from.dtype() != ret.dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU, priority, PROFILER_MESSAGE("CopyGPU2GPU")); @@ -665,34 +809,76 @@ NDArray &NDArray::operator/=(const real_t &src) { /* magic number for ndarray version 1, with int64_t TShape */ static const uint32_t NDARRAY_V1_MAGIC = 0xF993fac8; +/* magic number for ndarray version 2, with storage type */ +static const uint32_t NDARRAY_V2_MAGIC = 0xF993fac9; + void NDArray::Save(dmlc::Stream *strm) const { - strm->Write(NDARRAY_V1_MAGIC); + // write magic number to mark this version + // for storage type + strm->Write(NDARRAY_V2_MAGIC); + + // save storage type + int32_t stype = storage_type(); + strm->Write(&stype, sizeof(stype)); + + const int32_t nad = num_aux_data(storage_type()); + // save storage shape if ndarray is sparse + if (nad > 0) { + storage_shape().Save(strm); + } + + // save shape shape_.Save(strm); if (is_none()) return; + // save context Context ctx = this->ctx(); ctx.Save(strm); TBlob save_data; - NDArray temp; + NDArray nd_cpu; // a copy of *this on cpu if (ctx.dev_mask() != cpu::kDevMask) { - temp = this->Copy(Context::CPU()); - temp.WaitToRead(); - save_data = temp.data(); + nd_cpu = this->Copy(Context::CPU()); + nd_cpu.WaitToRead(); + save_data = nd_cpu.data(); } else { this->WaitToRead(); save_data = this->data(); + nd_cpu = *this; } + // save type flag int32_t type_flag = save_data.type_flag_; strm->Write(&type_flag, sizeof(type_flag)); + + // save aux_types and aux_shapes + if (nad > 0) { + for (int i = 0; i < nad; ++i) { + int32_t aux_type_flag = aux_type(i); + strm->Write(&aux_type_flag, sizeof(aux_type_flag)); + aux_shape(i).Save(strm); + } + } + + // save data CHECK(save_data.CheckContiguous()); size_t 
type_size = mshadow::mshadow_sizeof(type_flag); - strm->Write(save_data.dptr_, type_size * shape_.Size()); + // save data could be values of sparse tensors + // must use save_data.shape_ instead of this->shape_ + strm->Write(save_data.dptr_, type_size * save_data.shape_.Size()); + + // save aux data + if (nad > 0) { + for (int i = 0; i < nad; ++i) { + TBlob save_data = nd_cpu.aux_data(i); + // save aux_data + CHECK(save_data.CheckContiguous()); + size_t aux_type_size = mshadow::mshadow_sizeof(aux_type(i)); + strm->Write(save_data.dptr_, aux_type_size * save_data.Size()); + } + } } -bool LegacyTShapeLoad(dmlc::Stream *strm, TShape *shape) { - uint32_t magic; - if (strm->Read(&magic, sizeof(uint32_t)) != sizeof(uint32_t)) return false; +bool LegacyTShapeLoad(dmlc::Stream *strm, TShape *shape, const uint32_t magic) { switch (magic) { case NDARRAY_V1_MAGIC: return shape->Load(strm); @@ -708,10 +894,10 @@ bool LegacyTShapeLoad(dmlc::Stream *strm, TShape *shape) { } } -bool NDArray::Load(dmlc::Stream *strm) { +bool NDArray::LegacyLoad(dmlc::Stream *strm, const uint32_t magic) { // load shape TShape shape; - if (!LegacyTShapeLoad(strm, &shape)) return false; + if (!LegacyTShapeLoad(strm, &shape, magic)) return false; if (shape.ndim() == 0) { *this = NDArray(); return true; } @@ -739,6 +925,88 @@ bool NDArray::Load(dmlc::Stream *strm) { } } +bool NDArray::Load(dmlc::Stream *strm) { + uint32_t magic; + if (strm->Read(&magic, sizeof(uint32_t)) != sizeof(uint32_t)) return false; + if (magic != NDARRAY_V2_MAGIC) { + return LegacyLoad(strm, magic); + } + + // load storage type + int32_t stype; + if (strm->Read(&stype, sizeof(stype)) != sizeof(stype)) return false; + const int32_t nad = num_aux_data(static_cast(stype)); + + // load storage shape + TShape sshape; + if (nad > 0) { + if (!sshape.Load(strm)) return false; + } + + // load shape + TShape shape; + if (!shape.Load(strm)) return false; + if (shape.ndim() == 0) { + *this = NDArray(); return true; + } + + // load context + 
Context ctx; + if (!ctx.Load(strm)) return false; + + // load type flag + int32_t type_flag; + if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag)) return false; + + // load aux_types and aux_shapes + std::vector aux_types; + std::vector aux_shapes; + if (nad > 0) { + aux_types.resize(nad); + aux_shapes.resize(nad); + for (int i = 0; i < nad; ++i) { + // load aux_type(i) + if (strm->Read(&aux_types[i], sizeof(aux_types[i])) != sizeof(aux_types[i])) return false; + // load aux_shapes(i) + if (!aux_shapes[i].Load(strm)) return false; + } + } + + // load data into CPU + NDArray temp; + if (0 == nad) { + temp = NDArray(shape, Context::CPU(), false, type_flag); + } else { + temp = NDArray(static_cast(stype), shape, + Context::CPU(), false, type_flag, + aux_types, aux_shapes, sshape); + } + // load data + TBlob load_data = temp.data(); + size_t type_size = mshadow::mshadow_sizeof(type_flag); + size_t nread = type_size * load_data.Size(); + if (strm->Read(load_data.dptr_, nread) != nread) return false; + + // load aux_data + if (nad > 0) { + for (int i = 0; i < nad; ++i) { + load_data = temp.aux_data(i); + type_size = mshadow::mshadow_sizeof(load_data.type_flag_); + nread = type_size * load_data.Size(); + if (strm->Read(load_data.dptr_, nread) != nread) return false; + } + } + + if (ctx.dev_mask() == cpu::kDevMask) { + *this = std::move(temp); return true; + } else { +#if MXNET_USE_CUDA + *this = temp.Copy(ctx); return true; +#else + *this = std::move(temp); return true; +#endif + } +} const uint64_t kMXAPINDArrayListMagic = 0x112; @@ -771,7 +1039,16 @@ void NDArray::Load(dmlc::Stream* fi, } NDArray NDArray::Copy(Context ctx) const { - NDArray ret(shape(), ctx, true, dtype_); + NDArray ret; + if (kDefaultStorage == storage_type()) { + ret = NDArray(shape(), ctx, true, dtype_); + } else if (kUndefinedStorage != storage_type()) { + ret = NDArray(storage_type(), shape(), ctx, true, dtype_, + ptr_->aux_types, ptr_->aux_shapes, storage_shape()); + } else { + 
LOG(FATAL) << "NDArray::Copy cannot copy undefined storage-type ndarray to ctx.dev_type=" + << ctx.dev_type << ", ctx.dev_id=" << ctx.dev_id; + } CopyFromTo(*this, &ret); return ret; } @@ -804,6 +1081,101 @@ void NDArray::SyncCopyFromCPU(const void *data, size_t size) const { } } +/*! + * \brief Copy src.data()/aux_data(i) to dst->data()/aux_data(j). + */ +void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) { + if (i >= 0) { + CHECK_NE(src.storage_type(), kDefaultStorage); + } else { + CHECK(!src.is_none()) << "src dense ndarray must have been initialized"; + } + if (j >= 0) { + CHECK_NE(storage_type(), kDefaultStorage); + } else { + CHECK(!this->is_none()) << "dst dense ndarray must have been initialized"; + } + + if (src.var() == var()) { + // skip to copy to itself + LOG(WARNING) << "SyncCopyFromNDArray does not support copying to self"; + return; + } + const int src_dev_mask = src.ctx().dev_mask(); + const int dst_dev_mask = ctx().dev_mask(); + std::vector const_vars; + const_vars.push_back(src.var()); + + // get or create a dst tblob for copying src to it + // if dst is a dense format and has not been allocated, allocate memory for it + // else if dst is not initialized, allocate corresponding data blob for it + auto get_dst_data = [&](const TShape& src_shape) { + if (this->storage_type() == kDefaultStorage) { + this->ReshapeAndAlloc(src_shape); + } else if (!this->storage_initialized()) { + if (j < 0) { + this->CheckAndAllocData(src_shape); + } else { + this->CheckAndAllocAuxData(j, src_shape); + } + } + TBlob dst_data = (j >= 0? this->aux_data(j) : this->data()); + CHECK_LE(src_shape.Size(), dst_data.shape_.Size()); + return dst_data; + }; + + if (src_dev_mask == cpu::kDevMask && dst_dev_mask == cpu::kDevMask) { + Engine::Get()->PushSync([&](RunContext rctx) { + const TBlob src_data = (i >= 0? 
src.aux_data(i) : src.data()); + TBlob dst_data = get_dst_data(src_data.shape_); + ndarray::Copy(src_data, &dst_data, src.ctx(), this->ctx(), rctx); + }, this->ctx(), const_vars, {this->var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE("SyncCopyFromNDArrayCPU2CPU")); + } else { +#if MXNET_USE_CUDA + if (src_dev_mask == cpu::kDevMask && dst_dev_mask == gpu::kDevMask) { + Engine::Get()->PushSync([&](RunContext rctx) { + const TBlob src_data = (i >= 0? src.aux_data(i) : src.data()); + TBlob dst_data = get_dst_data(src_data.shape_); + ndarray::Copy(src_data, &dst_data, src.ctx(), this->ctx(), rctx); + rctx.get_stream()->Wait(); + }, this->ctx(), const_vars, {this->var()}, + FnProperty::kCopyToGPU, 0, PROFILER_MESSAGE("SyncCopyFromNDArrayCPU2GPU")); + } else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == cpu::kDevMask) { + Engine::Get()->PushSync([&](RunContext rctx) { + const TBlob src_data = (i >= 0? src.aux_data(i) : src.data()); + TBlob dst_data = get_dst_data(src_data.shape_); + ndarray::Copy(src_data, &dst_data, src.ctx(), this->ctx(), rctx); + rctx.get_stream()->Wait(); + }, this->ctx(), const_vars, {this->var()}, + FnProperty::kCopyFromGPU, 0, PROFILER_MESSAGE("SyncCopyFromNDArrayGPU2CPU")); + } else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == gpu::kDevMask) { + Engine::Get()->PushSync([&](RunContext rctx) { + const TBlob src_data = (i >= 0? src.aux_data(i) : src.data()); + TBlob dst_data = get_dst_data(src_data.shape_); + ndarray::Copy(src_data, &dst_data, src.ctx(), this->ctx(), rctx); + rctx.get_stream()->Wait(); + }, this->ctx(), const_vars, {this->var()}, + src.dtype() != this->dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU, + 0, PROFILER_MESSAGE("SyncCopyFromNDArrayGPU2GPU")); + } else { + LOG(FATAL) << "unknown device mask"; + } +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif + } + // The copy operation was pushed to engine to execute. + // Need to wait here for it being completed. 
+ // The reason for pushing the copy operation to engine + // is because when copying data from a sparse tensor + // to the current one, that sparse ndarray's storage_shape/aux_shape + // may not be ready or changed and we need to ensure + // thread safty for reading the correct shape info to allocate + // memory for the current ndarray. + WaitToRead(); +} + void NDArray::SyncCopyToCPU(void *data, size_t size) const { TShape dshape = this->shape(); CHECK_EQ(dshape.Size(), size) diff --git a/src/ndarray/ndarray_function-inl.h b/src/ndarray/ndarray_function-inl.h index 2be55f50f934..b284e0378647 100644 --- a/src/ndarray/ndarray_function-inl.h +++ b/src/ndarray/ndarray_function-inl.h @@ -30,27 +30,28 @@ // macro to help specialize evaluation function #ifndef DECL_TERNARY -#define DECL_TERNARY(XPU, OP, FUN) \ - template<> \ - void Eval(const TBlob &lhs, const TBlob &mhs, \ - const TBlob &rhs, TBlob *ret, RunContext ctx) { \ - FUN(lhs, mhs, rhs, ret, ctx); \ +#define DECL_TERNARY(XPU, OP, FUN) \ + template<> \ + void Eval(const TBlob &lhs, const TBlob &mhs, \ + const TBlob &rhs, TBlob *ret, RunContext ctx) { \ + FUN(lhs, mhs, rhs, ret, ctx); \ } #endif #ifndef DECL_BINARY -#define DECL_BINARY(XPU, OP, FUN) \ - template<> \ +#define DECL_BINARY(XPU, OP, FUN) \ + template<> \ void Eval(const TBlob &lhs, const TBlob &rhs, TBlob *ret, RunContext ctx) { \ - FUN(lhs, rhs, ret, ctx); \ + FUN(lhs, rhs, ret, ctx); \ } #endif #ifndef DECL_SCALAR -#define DECL_SCALAR(XPU, OP, FUN, REVERSE) \ - template<> \ - void Eval(const TBlob &lhs, const real_t &rhs, TBlob *ret, RunContext ctx) { \ - FUN(lhs, rhs, ret, ctx); \ +#define DECL_SCALAR(XPU, OP, FUN, REVERSE) \ + template<> \ + void Eval(const TBlob &lhs, const real_t &rhs, \ + TBlob *ret, RunContext ctx) { \ + FUN(lhs, rhs, ret, ctx); \ } #endif @@ -62,10 +63,11 @@ namespace mxnet { namespace ndarray { + // true implementation template -inline void EvalBinary_(const TBlob &lhs, const TBlob &rhs, - TBlob *ret, RunContext ctx) { 
+void EvalBinary_(const TBlob &lhs, const TBlob &rhs, + TBlob *ret, RunContext ctx) { using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); CHECK_EQ(ret->type_flag_, lhs.type_flag_) @@ -79,10 +81,9 @@ inline void EvalBinary_(const TBlob &lhs, const TBlob &rhs, }); } - template -inline void EvalOneHot_(const TBlob &index, const TBlob &rhs, - TBlob *ret, RunContext ctx) { +void EvalOneHot_(const TBlob &index, const TBlob &rhs, + TBlob *ret, RunContext ctx) { LOG(INFO) << "The operator onehot_encode is deprecated; use one_hot instead."; using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); @@ -99,8 +100,8 @@ inline void EvalOneHot_(const TBlob &index, const TBlob &rhs, } template -inline void EvalMatChooseRowElem_(const TBlob &lhs, const TBlob &rhs, - TBlob *ret, RunContext ctx) { +void EvalMatChooseRowElem_(const TBlob &lhs, const TBlob &rhs, + TBlob *ret, RunContext ctx) { using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); // TODO(eric): support mixed type choose, i.e. int index and float rhs. 
@@ -116,8 +117,8 @@ inline void EvalMatChooseRowElem_(const TBlob &lhs, const TBlob &rhs, } template -inline void EvalMatFillRowElem_(const TBlob &lhs, const TBlob &mhs, const TBlob &rhs, - TBlob *ret, RunContext ctx) { +void EvalMatFillRowElem_(const TBlob &lhs, const TBlob &mhs, const TBlob &rhs, + TBlob *ret, RunContext ctx) { using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); ret->get(s) @@ -127,8 +128,8 @@ inline void EvalMatFillRowElem_(const TBlob &lhs, const TBlob &mhs, const TBlob } template -inline void EvalScalar_(const TBlob &lhs, const real_t &rhs, - TBlob *ret, RunContext ctx) { +void EvalScalar_(const TBlob &lhs, const real_t &rhs, + TBlob *ret, RunContext ctx) { using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); CHECK_EQ(ret->type_flag_, lhs.type_flag_) @@ -148,7 +149,7 @@ inline void EvalScalar_(const TBlob &lhs, const real_t &rhs, template<> void EvalClip(const TBlob &src, const real_t &a_min, const real_t &a_max, - TBlob *ret, RunContext ctx) { + TBlob *ret, RunContext ctx) { typedef DEVICE xpu; using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); @@ -163,12 +164,11 @@ void EvalClip(const TBlob &src, const real_t &a_min, const real_t &a_max } template<> -void EvalRandom( - const real_t &a, - const real_t &b, - const Resource &resource, - TBlob *ret, - RunContext ctx) { +void EvalRandom(const real_t &a, + const real_t &b, + const Resource &resource, + TBlob *ret, + RunContext ctx) { typedef DEVICE xpu; mshadow::Stream *s = ctx.get_stream(); switch (ret->type_flag_) { @@ -444,6 +444,7 @@ DECL_SCALAR(DEVICE, Plus, EvalScalar_, true) DECL_SCALAR(DEVICE, Minus, EvalScalar_, true) DECL_SCALAR(DEVICE, Mul, EvalScalar_, true) DECL_SCALAR(DEVICE, Div, EvalScalar_, true) + // for reverse seq DECL_SCALAR(DEVICE, Plus, EvalScalar_, false) DECL_SCALAR(DEVICE, Minus, EvalScalar_, false) diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index e4af86d2c824..5cea7942efa6 100644 
--- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -25,6 +25,7 @@ // this will be invoked by gcc and compile CPU version #include "./ndarray_function.h" #include "./ndarray_function-inl.h" +#include "../common/utils.h" namespace mxnet { namespace ndarray { @@ -44,5 +45,138 @@ void Copy(const TBlob &from, TBlob *to, } }) } + +template +void ElementwiseSumRspImpl(mshadow::Stream* s, + const std::vector& nds, + const std::vector& uniq_row_idx, + NDArray* out, + const int nthreads = 4) { +#pragma omp parallel num_threads(nthreads) + { + const size_t nnr = uniq_row_idx.size(); + const int num_threads = omp_get_num_threads(); + size_t row_block_len = (nnr + num_threads - 1) / num_threads; + const size_t row_block_start = omp_get_thread_num() * row_block_len; + if (row_block_start < nnr) { + const size_t row_block_end = std::min(row_block_start+row_block_len, nnr); + + const size_t row_length = out->data().shape_.ProdShape(1, out->data().shape_.ndim()); + auto out_values = out->data().get_with_shape( + mshadow::Shape2(out->storage_shape()[0], row_length), s); + auto out_indices = out->aux_data(rowsparse::kIdx).FlatTo1D(); + for (size_t i = row_block_start; i < row_block_end; ++i) { + out_indices[i] = uniq_row_idx[i]; + } + for (const auto& nd : nds) { + if (nd.storage_initialized()) { + const auto nd_indices = nd.aux_data(rowsparse::kIdx).FlatTo1D(); + const auto nd_values = nd.data().get_with_shape( + mshadow::Shape2(nd.storage_shape()[0], row_length), s); + const auto nd_num_rows = nd.aux_shape(rowsparse::kIdx).Size(); + const IType* nd_indices_start = &nd_indices[0]; + const IType* nd_indices_end = nd_indices_start + nd_num_rows; + const IType* row_idx_ptr = std::lower_bound(nd_indices_start, nd_indices_end, + out_indices[row_block_start]); + // skip this nd if all of its row indices are smaller than out_indices[row_block_start] + // or current row block is not covered by [*row_idx_ptr, nd_indices_end). 
+ if (nd_indices_end == row_idx_ptr || *row_idx_ptr > out_indices[row_block_end-1]) { + continue; + } + for (size_t irow = row_block_start; + irow < row_block_end && row_idx_ptr != nd_indices_end;) { + if (out_indices[irow] == *row_idx_ptr) { + auto out_value_cur_row = out_values[irow]; + const auto offset = row_idx_ptr - nd_indices_start; + auto nd_value_cur_row = nd_values[offset]; + for (size_t j = 0; j < nd_value_cur_row.shape_[0]; ++j) { + out_value_cur_row[j] += nd_value_cur_row[j]; + } + ++irow; + ++row_idx_ptr; + } else if (out_indices[irow] < *row_idx_ptr) { + ++irow; + } else { + ++row_idx_ptr; + } + } + } + } + } + } +} + +/*! + * \brief Given a vector of ndarrays, generate a index vector containing + * all the unique row indices of the ndarrays. + */ +template +void GetUniqueRspRowIdx(const std::vector& nds, + std::vector* uniq_row_idx) { + using namespace rowsparse; + size_t total_num_rows = 0; + for (const auto& nd : nds) { + CHECK_EQ(nd.storage_type(), kRowSparseStorage); + if (nd.storage_initialized()) { + total_num_rows += nd.aux_shape(kIdx).Size(); + } + } + + uniq_row_idx->resize(total_num_rows); + int nthreads = omp_get_max_threads(); + int offset = 0; + for (const auto& nd : nds) { + if (nd.storage_initialized()) { + const IType* nd_row_idx = nd.aux_data(kIdx).dptr(); + const int num_rows = nd.aux_shape(kIdx).Size(); +#pragma omp parallel for num_threads(nthreads) + for (int i = 0; i < num_rows; ++i) { + (*uniq_row_idx)[offset+i] = nd_row_idx[i]; + } + offset += num_rows; + } + } + + common::ParallelSort(uniq_row_idx->begin(), uniq_row_idx->end(), nthreads); + auto it = std::unique(uniq_row_idx->begin(), uniq_row_idx->end()); + uniq_row_idx->resize(it - uniq_row_idx->begin()); +} + +void ElementwiseSumRsp(mshadow::Stream* s, const std::vector& nds, NDArray* out) { + if (nds.empty()) return; + using namespace rowsparse; + CHECK_EQ(out->storage_type(), kRowSparseStorage) + << "Expected row sparse storage type (" + << out->storage_type() << " 
given)"; + + MSHADOW_TYPE_SWITCH(out->dtype(), DType, { + MSHADOW_IDX_TYPE_SWITCH(out->aux_type(kIdx), IType, { + std::vector uniq_row_idx; + GetUniqueRspRowIdx(nds, &uniq_row_idx); + out->CheckAndAlloc({mshadow::Shape1(uniq_row_idx.size())}); + out->data().FlatTo2D() = static_cast(0); + ElementwiseSumRspImpl(s, nds, uniq_row_idx, out, omp_get_max_threads()); + }); + }); +} + +/*! + * \brief Parallel cpu impl of elemwise sum for sparse tensors. + * Currently only support row sparse sum. + */ +template<> +void ElementwiseSum(mshadow::Stream* s, + const std::vector& nds, + NDArray* out) { + if (nds.empty()) return; + + if (nds[0].storage_type() == kRowSparseStorage) { + ElementwiseSumRsp(s, nds, out); + } else { + LOG(FATAL) << "ElementwiseSum has not been implemented for storage_type = << " + << nds[0].storage_type(); + } +} + } // namespace ndarray } // namespace mxnet diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h index b1ed58db3e74..65c59185f691 100644 --- a/src/ndarray/ndarray_function.h +++ b/src/ndarray/ndarray_function.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "../operator/mshadow_op.h" @@ -168,6 +169,14 @@ void ElementwiseSum(const std::vector source, TBlob *out, RunContext ctx); +/*! 
+ * \brief Interface for parallel impl of elemwise sum for sparse matrices + */ +template +void ElementwiseSum(mshadow::Stream* s, + const std::vector& nds, + NDArray* out); + // broadcasting template void EvalBroadcast(TBlob const& src, TBlob* ret, int size, RunContext ctx); diff --git a/src/nnvm/legacy_op_util.cc b/src/nnvm/legacy_op_util.cc index 2bba5f1c3655..6e601780080b 100644 --- a/src/nnvm/legacy_op_util.cc +++ b/src/nnvm/legacy_op_util.cc @@ -60,19 +60,20 @@ class OperatorState { opr_ = opr; fwd_init_ = bwd_init_ = false; - in_data_.resize(prop->ListArguments().size()); + in_data_fwd_.resize(prop->ListArguments().size()); + in_data_bwd_.resize(prop->ListArguments().size()); out_data_.resize(prop->NumOutputs()); aux_data_.resize(prop->ListAuxiliaryStates().size()); - in_grad_.resize(in_data_.size()); + in_grad_.resize(in_data_fwd_.size()); out_grad_.resize(prop->NumVisibleOutputs()); std::vector out_grad_ptr(out_grad_.size()); for (size_t i = 0; i < out_grad_.size(); ++i) { out_grad_ptr[i] = &out_grad_[i]; } - std::vector in_data_ptr(in_data_.size()); - for (size_t i = 0; i < in_data_.size(); ++i) { - in_data_ptr[i] = &in_data_[i]; + std::vector in_data_ptr(in_data_fwd_.size()); + for (size_t i = 0; i < in_data_fwd_.size(); ++i) { + in_data_ptr[i] = &in_data_bwd_[i]; } std::vector out_data_ptr(out_data_.size()); for (size_t i = 0; i < out_data_.size(); ++i) { @@ -89,16 +90,19 @@ class OperatorState { const std::vector& req, const std::vector& outputs) { if (!fwd_init_) { - CHECK_EQ(inputs.size(), in_data_.size() + aux_data_.size()); + CHECK_EQ(inputs.size(), in_data_fwd_.size() + aux_data_.size()); CHECK_EQ(outputs.size(), out_data_.size()); - for (size_t i = 0; i < in_data_.size(); ++i) in_data_[i] = inputs[i]; + // in_data_bwd_ has the same tblobs as the ones in in_data_fwd_, except that the ones + // referred by arg_data_ptr_ will be overriden + for (size_t i = 0; i < in_data_fwd_.size(); ++i) in_data_fwd_[i] = inputs[i]; + for (size_t i = 0; i < 
in_data_fwd_.size(); ++i) in_data_bwd_[i] = inputs[i]; for (size_t i = 0; i < aux_data_.size(); ++i) { - aux_data_[i] = inputs[i + in_data_.size()]; + aux_data_[i] = inputs[i + in_data_fwd_.size()]; } for (size_t i = 0; i < out_data_.size(); ++i) out_data_[i] = outputs[i]; fwd_init_ = true; } - opr_->Forward(ctx, in_data_, req, out_data_, aux_data_); + opr_->Forward(ctx, in_data_fwd_, req, out_data_, aux_data_); } void Backward(const OpContext &ctx, @@ -108,6 +112,8 @@ class OperatorState { if (!bwd_init_) { CHECK(fwd_init_); CHECK_EQ(arg_data_ptr_.size() + aux_data_.size(), inputs.size()); + // override tblobs pointed by arg_data_ptr_ since they might not contain + // initialized data during forward pass. for (size_t i = 0; i < arg_data_ptr_.size(); ++i) { *arg_data_ptr_[i] = inputs[i]; } @@ -118,13 +124,19 @@ class OperatorState { for (size_t i = 0; i < outputs.size(); ++i) in_grad_[i] = outputs[i]; bwd_init_ = true; } - opr_->Backward(ctx, out_grad_, in_data_, out_data_, req, in_grad_, aux_data_); + opr_->Backward(ctx, out_grad_, in_data_bwd_, out_data_, req, in_grad_, aux_data_); } private: Operator *opr_; bool fwd_init_, bwd_init_; - std::vector in_data_, aux_data_, out_data_, in_grad_, out_grad_; + // input data blobs for forward and backward + // in_data_fwd_ and in_data_bwd_ will hold different tblobs when StorageFallbackOpExecutor + // performs storage fallback on a non-default input NDArray. The one in in_data_fwd_ is + // generated when setting up forward executor, while the one in in_data_bwd_ is generated + // when setting up backward executor. 
+ std::vector in_data_fwd_, in_data_bwd_; + std::vector aux_data_, out_data_, in_grad_, out_grad_; std::vector arg_data_ptr_; }; diff --git a/src/operator/batch_norm.cc b/src/operator/batch_norm.cc index 86f47dd6163f..866b7fe619cb 100644 --- a/src/operator/batch_norm.cc +++ b/src/operator/batch_norm.cc @@ -230,7 +230,7 @@ void BatchNormOp::DoBackward(mshadow::Stream *, #pragma omp parallel for for (int channel = 0; channel < static_cast(channelCount); ++channel) { const AccReal *weight = weights.dptr(); - const AccReal w = weight ? weight[channel] : AccReal(1); + const AccReal w = !param_.fix_gamma ? weight[channel] : AccReal(1); AccReal mean, invstd; if (is_train_and_not_global_stats) { mean = saveMeanDataPtr[channel]; diff --git a/src/operator/batch_norm.cu b/src/operator/batch_norm.cu index 64f7d9373823..9a8b576a16ee 100644 --- a/src/operator/batch_norm.cu +++ b/src/operator/batch_norm.cu @@ -283,7 +283,7 @@ __global__ void BatchNormalizationUpdateOutputKernel( } // Write normalized and update the output - const AccReal gamma = weight.numElements() > 0 + const AccReal gamma = ((flags & FIX_GAMMA_FLAG) == 0 && weight.numElements() > 0) ? ScalarConvert::to(weight[plane]) : ScalarConvert::to(1); const AccReal beta = bias.numElements() > 0 ? ScalarConvert::to(bias[plane]) @@ -332,7 +332,7 @@ static __global__ void BatchNormalizationBackwardKernel( invstd = VARIANCE_TO_INVSTD(tensors.runningVar[plane], eps); } - const AccReal weightVal = tensors.weight.numElements() > 0 ? + const AccReal weightVal = ((flags & FIX_GAMMA_FLAG) == 0 && tensors.weight.numElements() > 0) ? 
ScalarConvert::to(tensors.weight[plane]) : AccReal(1); const AccReal norm = AccReal(1) / N; diff --git a/src/operator/deconvolution-inl.h b/src/operator/deconvolution-inl.h index 43530138b8ea..9db94a8c5986 100644 --- a/src/operator/deconvolution-inl.h +++ b/src/operator/deconvolution-inl.h @@ -256,7 +256,7 @@ class DeconvolutionOp : public Operator { if (!param_.no_bias) { // add bias, broadcast bias to dim 1: channel Tensor bias = in_data[deconv::kBias].get(s); - out += broadcast<1>(bias, out.shape_); + out += mshadow::expr::broadcast<1>(bias, out.shape_); } } diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h index 9b398f947e30..f60bb590a2e6 100644 --- a/src/operator/elemwise_op_common.h +++ b/src/operator/elemwise_op_common.h @@ -80,6 +80,42 @@ inline bool ElemwiseAttr(const nnvm::NodeAttrs& attrs, return true; } +// Only inferring output storage types from input for now +template +inline bool ElemwiseStorageAttr(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + auto deduce = [&](std::vector *vec, const char *name, AttrType& result, + bool fallback) { + auto &v = *vec; + for (size_t i = 0; i < vec->size(); ++i) { + if (v[i] == kUndefinedStorage) { + // if input type is unknown, assume it's default storage + CHECK(assign(&v[i], kDefaultStorage)); + } else if (assign(&result, v[i]) == false && fallback) { + result = kDefaultStorage; + } + } + }; + AttrType dattr = kUndefinedStorage; + deduce(in_attrs, "input", dattr, enable_fallback); + if (reverse_infer) { + LOG(FATAL) << "not implemented yet"; + } + auto write = [&](std::vector *vec, const char *name) { + for (size_t i = 0; i < vec->size(); ++i) { + CHECK(assign(&(*vec)[i], dattr)) + << "Incompatible attr in node " << attrs.name << " at " << i << "-th " + << name << ": " << "expected " << dattr << ", got " << (*vec)[i]; + } + }; + if (is_none(dattr)) dattr = kDefaultStorage; + write(out_attrs, "output"); + return true; +} + template inline bool 
ElemwiseShape(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, @@ -108,6 +144,18 @@ inline bool ElemwiseType(const nnvm::NodeAttrs& attrs, attrs, in_attrs, out_attrs, -1); } +template +inline bool ElemwiseStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *in_attrs, + std::vector *out_attrs) { + // TODO(junwu): add ctx info into storage inference logic + CHECK_EQ(in_attrs->size(), static_cast(n_in)) << " in operator " << attrs.name; + CHECK_EQ(out_attrs->size(), static_cast(n_out)) << " in operator " << attrs.name; + return ElemwiseStorageAttr( + attrs, in_attrs, out_attrs); +} + // Transfer gradient and input to FGradient function struct ElemwiseGradUseIn { const char *op_name; diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h index 828930a0e405..d228e3e67d03 100644 --- a/src/operator/leaky_relu-inl.h +++ b/src/operator/leaky_relu-inl.h @@ -111,7 +111,7 @@ class LeakyReLUOp : public Operator { case leakyrelu::kPReLU: { weight = in_data[leakyrelu::kGamma].get(s); Assign(out, req[leakyrelu::kOut], - F(data, broadcast<1>(weight, out.shape_))); + F(data, mshadow::expr::broadcast<1>(weight, out.shape_))); break; } case leakyrelu::kRReLU: { @@ -177,7 +177,8 @@ class LeakyReLUOp : public Operator { weight = in_data[leakyrelu::kGamma].get(s); grad_weight = in_grad[leakyrelu::kGamma].get(s); grad_weight = sumall_except_dim<1>(F(data) * grad); - gdata = F(data, broadcast<1>(weight, data.shape_)) * grad; + gdata = F(data, mshadow::expr::broadcast<1>(weight, data.shape_)) + * grad; break; } case leakyrelu::kRReLU: { diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index 0af7d026d9d5..3162ab6b7b16 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -25,8 +25,12 @@ #ifndef MXNET_OPERATOR_MXNET_OP_H_ #define MXNET_OPERATOR_MXNET_OP_H_ +#include #include #include +#ifdef __CUDACC__ +#include "../common/cuda_utils.h" +#endif // __CUDACC__ namespace mxnet { namespace op { @@ -40,6 +44,8 @@ const 
float PI = 3.14159265358979323846; using std::isnan; #endif +template +int get_num_threads(const int N); #ifdef __CUDACC__ #define CUDA_KERNEL_LOOP(i, n) \ @@ -47,6 +53,13 @@ using std::isnan; i < (n); \ i += blockDim.x * gridDim.x) +inline cudaDeviceProp cuda_get_device_prop() { + int device; + CUDA_CALL(cudaGetDevice(&device)); + cudaDeviceProp deviceProp; + CUDA_CALL(cudaGetDeviceProperties(&deviceProp, device)); + return deviceProp; +} /*! * \brief Get the number of blocks for cuda kernel given N @@ -55,8 +68,18 @@ inline int cuda_get_num_blocks(const int N) { using namespace mshadow::cuda; return std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum); } + +template<> +inline int get_num_threads(const int N) { + using namespace mshadow::cuda; + return kBaseThreadNum * cuda_get_num_blocks(N); +} #endif // __CUDACC__ +template<> +inline int get_num_threads(const int N) { + return omp_get_max_threads(); +} /*! \brief operator request type switch */ #define MXNET_ASSIGN_REQ_SWITCH(req, ReqType, ...) \ @@ -216,7 +239,6 @@ __global__ void mxnet_generic_kernel(int N, Args... args) { } } - template struct Kernel { template diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h index 2d46bd3230ce..dc53e1a7d232 100644 --- a/src/operator/operator_common.h +++ b/src/operator/operator_common.h @@ -29,12 +29,15 @@ #include #include #include +#include +#include #include #include #include #include #include #include "../common/cuda_utils.h" +#include "../common/utils.h" namespace mxnet { namespace op { @@ -125,6 +128,19 @@ inline std::string type_string(const int& x) { return "unknown"; } +/*! \brief get string representation of storage_type */ +inline std::string stype_string(const int& x) { + switch (x) { + case kDefaultStorage: + return "default"; + case kCSRStorage: + return "csr"; + case kRowSparseStorage: + return "row_sparse"; + } + return "unknown"; +} + /*! * \brief Assign x to y. Checks for compatiblity when y is not empty. 
* Allow missing dim in both x and y (as 0). @@ -201,6 +217,24 @@ inline bool type_assign(int *y, const int& x) { } \ } +/*! + * \brief macro assign type to out if out is unknown (-1) otherwise check consistency + * Use macro so we can see the error file more clearly + * \param type_array the storage type array to store the result + * \param index the index of in the array + * \param type the inferred storage type + */ +#define STORAGE_TYPE_ASSIGN_CHECK(type_array, index, type) \ + { \ + if (!type_assign(&(type_array)[index], type)) { \ + std::ostringstream os; \ + os << "Storage type inconsistent, Provided=" \ + << stype_string((type_array)[index]) << ',' \ + << " inferred storage type=" << stype_string(type); \ + throw ::mxnet::op::InferTypeError(os.str(), index); \ + } \ + } + // helper macro to implement bind dispatch #if MXNET_USE_CUDA #define DO_BIND_DISPATCH(Method, ...) \ @@ -333,6 +367,54 @@ inline void ParamParser(nnvm::NodeAttrs* attrs) { attrs->parsed = std::move(param); } +/*! \brief Perform storage fallback to invoke fcompute. 
+ * \param attrs attributes of the operator + * \param ctx operator context + * \param inputs inputs of fcompute + * \param req req of fcompute + * \param outputs outputs of fcompute + * \param fcompute + * \param fname name of the operator + * \param mutate_idx the indices of mutable inputs + */ +template +void FCompExFallback(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs, + FCompute fcompute, + const std::string& fname, + std::vector mutate_idx = {}) { + using namespace mxnet::common; + std::vector in_blobs, out_blobs; + std::vector pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src; + // mapping from index in input_blobs to index in pre_temp_dst + std::unordered_map in_temp_idx_map; + SetupDefaultBlobs(inputs, &in_blobs, &pre_temp_src, &pre_temp_dst, &in_temp_idx_map); + SetupDefaultBlobs(outputs, &out_blobs, &post_temp_dst, &post_temp_src); + for (const auto idx : mutate_idx) { + auto map_iter = in_temp_idx_map.find(idx); + if (map_iter != in_temp_idx_map.end()) { + post_temp_src.push_back(pre_temp_dst[map_iter->second]); + post_temp_dst.push_back(inputs[idx]); + } + } + CastNonDefaultStorage(pre_temp_src, pre_temp_dst, ctx, true); + fcompute(attrs, ctx, in_blobs, req, out_blobs); + CastNonDefaultStorage(post_temp_src, post_temp_dst, ctx, true); +} + +#define CHECK_RSP_ALL_ROWS_NON_ZERO(rsp, func, param) \ + { \ + CHECK(rsp.storage_shape()[0] == rsp.shape()[0]) << func \ + << " for RowSparse " << param << " is only implemented for " \ + << "RowSparse " << param << " with all rows containing non-zeros. 
" \ + << "Expects " << param << ".values.shape[0] (" << rsp.storage_shape()[0] \ + << ") == " << param << ".shape[0] (" << rsp.shape()[0] << ")."; \ + } + + } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_OPERATOR_COMMON_H_ diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 70759b15251a..28707aae4ce8 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -36,6 +36,7 @@ #include "./mshadow_op.h" #include "./elemwise_op_common.h" #include "mxnet_op.h" +#include "./tensor/init_op.h" namespace mxnet { namespace op { @@ -102,6 +103,167 @@ inline void SGDUpdate(const nnvm::NodeAttrs& attrs, }); } +/*! \brief kernel for sparse sgd + */ +template +struct SGDDnsRspKernel { + // DType is the output data type + // IType is row sparse idx type + // i is the ith row in row sparse gradient + template + MSHADOW_XINLINE static void Map(int i, const index_t row_length, DType* out, const DType* weight, + const IType* grad_idx, const DType *grad_val, + const DType clip_gradient, const DType lr, + const DType wd, const DType rescale_grad) { + for (index_t j = 0; j < row_length; j++) { + index_t data_i = grad_idx[i] * row_length + j; + index_t grad_i = i * row_length + j; + if (clip_gradient >= 0.0f) { + KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] - + (lr) * mshadow_op::clip::Map(rescale_grad * grad_val[grad_i], clip_gradient)); + } else { + KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] - + (lr * rescale_grad) * grad_val[grad_i]); + } + } + } +}; + +template +inline void SGDUpdateDnsRspImpl(const SGDParam& param, + const OpContext &ctx, + const TBlob& weight, + const NDArray& grad, + const OpReqType& req, + TBlob *out) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace mshadow_op; + using namespace mxnet_op; + Stream* s = ctx.get_stream(); + CHECK_EQ(grad.storage_type(), kRowSparseStorage); + // if gradients are zeros, no weights are 
updated + if (!grad.storage_initialized() || req == kNullOp) return; + CHECK_EQ(req, kWriteInplace) << "kWriteInplace is expected for sparse sgd_mom_update"; + CHECK_GT(weight.shape_.Size(), 0); + + MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(rowsparse::kIdx), IType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + DType* weight_data = weight.dptr(); + IType* grad_idx = grad.aux_data(rowsparse::kIdx).dptr(); + DType* grad_val = grad.data().dptr(); + index_t num_rows = grad.aux_shape(rowsparse::kIdx)[0]; + auto row_length = weight.shape_.ProdShape(1, weight.ndim()); + Kernel, xpu>::Launch(s, num_rows, row_length, + out->dptr(), weight_data, grad_idx, grad_val, + static_cast(param.clip_gradient), + static_cast(param.lr), static_cast(param.wd), + static_cast(param.rescale_grad)); + }); + }); + }); +} + +/*! \brief kernel for sparse sgd + */ +template +struct SGDRspDnsKernel { + template + MSHADOW_XINLINE static void Map(int i, const index_t num_cols, DType* out, const DType* weight, + const DType *grad, const DType clip_gradient, const DType lr, + const DType wd, const DType rescale_grad) { + bool contains_non_zeros = false; + index_t j = 0; + index_t offset = i * num_cols; + for (; j < num_cols; ++j) { + if (grad[offset + j] != 0) { + contains_non_zeros = true; + break; + } + } + if (!contains_non_zeros) return; + const DType rate = 1.f - lr * wd; + for (index_t j = 0; j < num_cols; j++) { + auto index = offset + j; + if (clip_gradient >= 0.0f) { + KERNEL_ASSIGN(out[index], req, rate * weight[index] - + lr * mshadow_op::clip::Map(rescale_grad * grad[index], clip_gradient)); + } else { + KERNEL_ASSIGN(out[index], req, rate * weight[index] - + lr * rescale_grad * grad[index]); + } + } + } +}; + +template +inline void SGDUpdateRspDnsImpl(const SGDParam& param, + const OpContext &ctx, + const NDArray& weight, + const TBlob& grad, + const OpReqType req, + NDArray *out) { + using namespace mshadow; + using namespace mxnet_op; 
+ using namespace rowsparse; + CHECK_RSP_ALL_ROWS_NON_ZERO(weight, "SGDUpdate", "weights"); + CHECK_EQ(weight.storage_type(), kRowSparseStorage); + if (req == kNullOp) return; + CHECK_EQ(req, kWriteInplace) << "kWriteInplace is expected for sparse sgd_update"; + CHECK(weight.storage_initialized()); + Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(weight.dtype(), DType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + DType* weight_data = weight.data().dptr(); + DType* grad_data = grad.dptr(); + index_t num_rows = weight.aux_shape(kIdx)[0]; + auto num_cols = weight.shape().ProdShape(1, weight.shape().ndim()); + Kernel, xpu>::Launch(s, num_rows, num_cols, + out->data().dptr(), weight_data, grad_data, + static_cast(param.clip_gradient), + static_cast(param.lr), static_cast(param.wd), + static_cast(param.rescale_grad)); + }); + }); +} + +template +inline void SGDUpdateRspRspImpl(const SGDParam& param, + const OpContext& ctx, + const NDArray& weight, + const NDArray& grad, + const OpReqType& req, + NDArray *out) { + CHECK_RSP_ALL_ROWS_NON_ZERO(weight, "SGDUpdate", "weights"); + // reuse dns rsp implementation when storage_shape == shape + TBlob out_blob = out->data(); + SGDUpdateDnsRspImpl(param, ctx, weight.data(), grad, req, &out_blob); +} + +template +inline void SGDUpdateEx(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace mshadow_op; + const SGDParam& param = nnvm::get(attrs.parsed); + auto weight_stype = inputs[0].storage_type(); + auto grad_stype = inputs[1].storage_type(); + if (weight_stype == kRowSparseStorage && grad_stype == kRowSparseStorage) { + NDArray out = outputs[0]; + SGDUpdateRspRspImpl(param, ctx, inputs[0], inputs[1], req[0], &out); + } else if (weight_stype == kRowSparseStorage && grad_stype == kDefaultStorage) { + NDArray out = outputs[0]; + SGDUpdateRspDnsImpl(param, ctx, 
inputs[0], inputs[1].data(), req[0], &out); + } else { + FCompExFallback(attrs, ctx, inputs, req, outputs, SGDUpdate, "SGDUpdate"); + } +} + struct SGDMomParam : public dmlc::Parameter { float lr; float momentum; @@ -275,6 +437,196 @@ inline void MP_SGDMomUpdate(const nnvm::NodeAttrs& attrs, }); } +template +struct SGDMomDnsRspDnsKernel { + template + MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data, + DType* mom_data, const DType* weight_data, const IType* grad_idx, + const DType* grad_data, const DType clip_gradient, const DType momentum, + const DType lr, const DType wd, const DType rescale_grad) { + const DType rate = lr * wd; + for (index_t j = 0; j < row_length; j++) { + index_t data_i = grad_idx[i] * row_length + j; + index_t grad_i = i * row_length + j; + if (clip_gradient >= 0.0f) { + mom_data[data_i] = momentum * mom_data[data_i] + - rate * weight_data[data_i] + - lr * + mshadow_op::clip::Map(rescale_grad * grad_data[grad_i], + clip_gradient); + } else { + mom_data[data_i] = momentum * mom_data[data_i] + - rate * weight_data[data_i] + - lr * rescale_grad * grad_data[grad_i]; + } + KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]); + } + } +}; + +template +inline void SGDMomUpdateDnsRspDnsImpl(const SGDMomParam& param, + const OpContext& ctx, + const TBlob& weight, + const NDArray& grad, + const TBlob& mom, + const OpReqType& req, + TBlob *out) { + using namespace mxnet_op; + using namespace rowsparse; + Stream* s = ctx.get_stream(); + if (!grad.storage_initialized() || req == kNullOp) return; + CHECK_EQ(req, kWriteInplace) << "kWriteInplace is expected for sparse sgd_mom_update"; + CHECK_GT(weight.shape_.Size(), 0); + CHECK_GT(mom.shape_.Size(), 0); + + MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(kIdx), IType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + DType* weight_data = weight.dptr(); + IType* grad_idx = grad.aux_data(kIdx).dptr(); + DType* 
grad_val = grad.data().dptr(); + DType* mom_data = mom.dptr(); + DType* out_data = out->dptr(); + index_t num_rows = grad.aux_shape(kIdx)[0]; + auto row_length = weight.shape_.ProdShape(1, weight.ndim()); + Kernel, xpu>::Launch(s, num_rows, row_length, + out_data, mom_data, weight_data, grad_idx, grad_val, + static_cast(param.clip_gradient), static_cast(param.momentum), + static_cast(param.lr), static_cast(param.wd), + static_cast(param.rescale_grad)); + }); + }); + }); +} + +template +struct SGDMomRspDnsKernel { + template + MSHADOW_XINLINE static void Map(int i, index_t num_cols, DType* out, DType* mom, + const DType* weight, const DType *grad, + const DType clip_gradient, const DType momentum, + const DType lr, const DType wd, const DType rescale_grad) { + bool contains_non_zeros = false; + index_t j = 0; + index_t offset = i * num_cols; + for (; j < num_cols; ++j) { + if (grad[offset + j] != 0) { + contains_non_zeros = true; + break; + } + } + if (!contains_non_zeros) return; + const DType rate = lr * wd; + for (index_t j = 0; j < num_cols; j++) { + auto index = offset + j; + if (clip_gradient >= 0.0f) { + mom[index] = momentum * mom[index] - rate * weight[index] + - lr * mshadow_op::clip::Map(rescale_grad * grad[index], clip_gradient); + } else { + mom[index] = momentum * mom[index] - rate * weight[index] + - lr * rescale_grad * grad[index]; + } + KERNEL_ASSIGN(out[index], req, weight[index] + mom[index]); + } + } +}; + +template +inline void SGDMomUpdateRspDnsImpl(const SGDMomParam& param, + const OpContext &ctx, + const NDArray& weight, + const TBlob& grad, + const NDArray& mom, + const OpReqType req, + NDArray *out) { + using namespace mshadow; + using namespace mxnet_op; + using namespace rowsparse; + CHECK_RSP_ALL_ROWS_NON_ZERO(weight, "SGDMomUpdate", "weights"); + Stream* s = ctx.get_stream(); + CHECK_EQ(weight.storage_type(), kRowSparseStorage); + if (req == kNullOp) return; + CHECK_EQ(req, kWriteInplace) << "kWriteInplace is expected for sparse 
sgd_mom_update"; + CHECK(weight.storage_initialized()); + // fill mom with zero values if not initialized yet + if (!mom.storage_initialized()) { + NDArray mom_zeros = mom; + FillDnsZerosRspImpl(s, &mom_zeros); + } + MSHADOW_REAL_TYPE_SWITCH(weight.dtype(), DType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + DType* weight_data = weight.data().dptr(); + DType* grad_data = grad.dptr(); + DType* mom_data = mom.data().dptr(); + index_t num_rows = weight.aux_shape(kIdx)[0]; + auto num_cols = weight.shape().ProdShape(1, weight.shape().ndim()); + Kernel, xpu>::Launch(s, num_rows, num_cols, + out->data().dptr(), mom_data, weight_data, grad_data, + static_cast(param.clip_gradient), static_cast(param.momentum), + static_cast(param.lr), static_cast(param.wd), + static_cast(param.rescale_grad)); + }); + }); +} + + +template +inline void SGDMomUpdateRspRspRspImpl(const SGDMomParam& param, + const OpContext& ctx, + const NDArray& weight, + const NDArray& grad, + const NDArray& mom, + const OpReqType& req, + NDArray *out) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace mxnet_op; + using namespace rowsparse; + CHECK_RSP_ALL_ROWS_NON_ZERO(weight, "SGDMomUpdate", "weights"); + Stream* s = ctx.get_stream(); + // fill mom with zero values in order to reuse the sgd mom dns impl + if (!mom.storage_initialized()) { + NDArray mom_zeros = mom; + FillDnsZerosRspImpl(s, &mom_zeros); + } + TBlob out_blob = out->data(); + // reuse dns rsp implementation when storage_shape == shape + SGDMomUpdateDnsRspDnsImpl(param, ctx, weight.data(), grad, + mom.data(), req, &out_blob); +} + +template +inline void SGDMomUpdateEx(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mxnet_op; + const SGDMomParam& param = nnvm::get(attrs.parsed); + auto &weight = inputs[0]; + auto &grad = inputs[1]; + auto &mom = inputs[2]; + auto weight_stype = weight.storage_type(); + auto 
grad_stype = grad.storage_type(); + auto mom_stype = mom.storage_type(); + CHECK_EQ(weight_stype, mom_stype) << "Inconsistent storage type detected between mom.stype = " + << mom_stype << " and weight.stype = " << weight_stype; + if (weight_stype == kRowSparseStorage && grad_stype == kRowSparseStorage && + mom_stype == kRowSparseStorage) { + NDArray out = outputs[0]; + SGDMomUpdateRspRspRspImpl(param, ctx, weight, grad, mom, req[0], &out); + } else if (weight_stype == kRowSparseStorage && grad_stype == kDefaultStorage && + mom_stype == kRowSparseStorage) { + NDArray out = outputs[0]; + SGDMomUpdateRspDnsImpl(param, ctx, weight, grad.data(), mom, req[0], &out); + } else { + // inputs[2] is a mutable input + FCompExFallback(attrs, ctx, inputs, req, outputs, + SGDMomUpdate, "SGDMomUpdate", {2}); + } +} + struct AdamParam : public dmlc::Parameter { float lr; float beta1; @@ -348,6 +700,147 @@ inline void AdamUpdate(const nnvm::NodeAttrs& attrs, }); } +/*! + * Note: this kernel performs sparse adam update. For each row-slice in row_sparse + * gradient, it finds the corresponding elements in weight, mean and var and performs + * the update. 
+ * The kernel assumes dense weight/mean/var, and row_sparse gradient + */ +template +struct AdamDnsRspDnsKernel { + template + MSHADOW_XINLINE static void Map(int i, const nnvm::dim_t row_length, DType* out_data, + DType* mean_data, DType* var_data, const DType* weight_data, const IType* grad_idx, + const DType* grad_data, const DType clip_gradient, const DType beta1, const DType beta2, + const DType lr, const DType wd, const DType epsilon, const DType rescale_grad) { + using nnvm::dim_t; + using namespace mshadow_op; + const dim_t row_offset = grad_idx[i] * row_length; + for (dim_t j = 0; j < row_length; j++) { + // index in data/mean/var + const dim_t data_i = row_offset + j; + // index in grad + const dim_t grad_i = i * row_length + j; + const DType grad_rescaled = grad_data[grad_i] * rescale_grad + weight_data[data_i] * wd; + if (clip_gradient >= 0.0f) { + mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * + clip::Map(grad_rescaled, clip_gradient); + var_data[data_i] = beta2 * var_data[data_i] + (1.f - beta2) * square::Map( + clip::Map(grad_rescaled, clip_gradient)); + } else { + mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; + var_data[data_i] = beta2 * var_data[data_i] + + (1.f - beta2) * grad_rescaled * grad_rescaled; + } + KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] - lr * mean_data[data_i] / + (square_root::Map(var_data[data_i]) + epsilon)); + } + } +}; + + +template +inline void AdamUpdateDnsRspDnsImpl(const AdamParam& param, + const OpContext& ctx, + const TBlob& weight, + const NDArray& grad, + const TBlob& mean, + const TBlob& var, + const OpReqType& req, + TBlob *out) { + using namespace mxnet_op; + using namespace rowsparse; + Stream* s = ctx.get_stream(); + if (!grad.storage_initialized() || req == kNullOp) return; + CHECK_EQ(req, kWriteInplace) << "kWriteInplace is expected for sparse adam_update"; + CHECK_GT(weight.shape_.Size(), 0); + CHECK_GT(mean.shape_.Size(), 0); + 
CHECK_GT(var.shape_.Size(), 0); + + MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(kIdx), IType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + const DType* weight_data = weight.dptr(); + const IType* grad_idx = grad.aux_data(kIdx).dptr(); + const DType* grad_val = grad.data().dptr(); + DType* mean_data = mean.dptr(); + DType* var_data = var.dptr(); + DType* out_data = out->dptr(); + nnvm::dim_t num_rows = grad.aux_shape(kIdx)[0]; + const auto row_length = weight.shape_.ProdShape(1, weight.ndim()); + Kernel, xpu>::Launch(s, num_rows, row_length, + out_data, mean_data, var_data, weight_data, grad_idx, grad_val, + static_cast(param.clip_gradient), static_cast(param.beta1), + static_cast(param.beta2), static_cast(param.lr), + static_cast(param.wd), static_cast(param.epsilon), + static_cast(param.rescale_grad)); + }); + }); + }); +} + +template +inline void AdamUpdateRspRspRspImpl(const AdamParam& param, + const OpContext& ctx, + const NDArray& weight, + const NDArray& grad, + const NDArray& mean, + const NDArray& var, + const OpReqType& req, + NDArray *out) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace mxnet_op; + using namespace rowsparse; + CHECK_RSP_ALL_ROWS_NON_ZERO(weight, "AdamUpdate", "weights"); + Stream* s = ctx.get_stream(); + // fill mean and variance with zero values in order to reuse the sgd mom dns impl + if (!mean.storage_initialized()) { + NDArray mean_zeros = mean; + FillDnsZerosRspImpl(s, &mean_zeros); + } + if (!var.storage_initialized()) { + NDArray var_zeros = var; + FillDnsZerosRspImpl(s, &var_zeros); + } + TBlob out_blob = out->data(); + // reuse dns rsp implementation when storage_shape == shape + AdamUpdateDnsRspDnsImpl(param, ctx, weight.data(), grad, mean.data(), + var.data(), req, &out_blob); +} + + +template +inline void AdamUpdateEx(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector 
&outputs) { + const AdamParam& param = nnvm::get(attrs.parsed); + mshadow::Stream* s = ctx.get_stream(); + const auto weight_stype = inputs[0].storage_type(); + const auto grad_stype = inputs[1].storage_type(); + const auto mean_stype = inputs[2].storage_type(); + const auto var_stype = inputs[3].storage_type(); + + const auto out_stype = outputs[0].storage_type(); + CHECK_EQ(mean_stype, weight_stype) << "Inconsistent storage type detected between " + << " mean.stype = " << mean_stype << " and weight.stype = " << weight_stype; + CHECK_EQ(var_stype, weight_stype) << "Inconsistent storage type detected between " + << " var.stype = " << var_stype << " and weight.stype = " << weight_stype; + if (weight_stype == kRowSparseStorage && mean_stype == kRowSparseStorage && + var_stype == kRowSparseStorage && grad_stype == kRowSparseStorage && + out_stype == kRowSparseStorage) { + NDArray out = outputs[0]; + AdamUpdateRspRspRspImpl(param, ctx, inputs[0], inputs[1], inputs[2], + inputs[3], req[0], &out); + } else { + LOG(FATAL) << "Unexpected storage types: weight.stype = " << weight_stype + << ", var.stype = " << var_stype << ", mean.stype = " << mean_stype + << ", grad.stype = " << grad_stype; + } +} + // This RMSProp code follows the version in // http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) // by Alex Graves, 2013. diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc index b26c333edaef..9b2b088c5095 100644 --- a/src/operator/optimizer_op.cc +++ b/src/operator/optimizer_op.cc @@ -40,6 +40,9 @@ It updates the weights using:: weight = weight - learning_rate * gradient +If weight is stored with `row_sparse` storage type, +only the row slices whose indices appear in grad.indices are updated. 
+ )code" ADD_FILELINE) .set_num_inputs(2) .set_num_outputs(1) @@ -47,6 +50,7 @@ It updates the weights using:: .set_attr("FInferShape", ElemwiseShape<2, 1>) .set_attr("FInferType", ElemwiseType<2, 1>) .set_attr("FCompute", SGDUpdate) +.set_attr("FComputeEx", SGDUpdateEx) .add_argument("weight", "NDArray-or-Symbol", "Weight") .add_argument("grad", "NDArray-or-Symbol", "Gradient") .add_arguments(SGDParam::__FIELDS__()); @@ -70,6 +74,9 @@ It updates the weights using:: Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. +If weights are stored with `row_sparse` storage type, +only the row slices whose indices appear in grad.indices are updated (for both weight and momentum). + )code" ADD_FILELINE) .set_num_inputs(3) .set_num_outputs(1) @@ -81,6 +88,7 @@ Where the parameter ``momentum`` is the decay rate of momentum estimates at each return std::vector{2}; }) .set_attr("FCompute", SGDMomUpdate) +.set_attr("FComputeEx", SGDMomUpdateEx) .add_argument("weight", "NDArray-or-Symbol", "Weight") .add_argument("grad", "NDArray-or-Symbol", "Gradient") .add_argument("mom", "NDArray-or-Symbol", "Momentum") @@ -152,6 +160,7 @@ It updates the weights using:: return std::vector{2, 3}; }) .set_attr("FCompute", AdamUpdate) +.set_attr("FComputeEx", AdamUpdateEx) .add_argument("weight", "NDArray-or-Symbol", "Weight") .add_argument("grad", "NDArray-or-Symbol", "Gradient") .add_argument("mean", "NDArray-or-Symbol", "Moving mean") diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index 0e74e303dbc9..fe45f4be8c66 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -28,10 +28,12 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(sgd_update) -.set_attr("FCompute", SGDUpdate); +.set_attr("FCompute", SGDUpdate) +.set_attr("FComputeEx", SGDUpdateEx); NNVM_REGISTER_OP(sgd_mom_update) -.set_attr("FCompute", SGDMomUpdate); +.set_attr("FCompute", SGDMomUpdate) +.set_attr("FComputeEx", SGDMomUpdateEx); 
NNVM_REGISTER_OP(mp_sgd_update) .set_attr("FCompute", MP_SGDUpdate); @@ -40,7 +42,8 @@ NNVM_REGISTER_OP(mp_sgd_mom_update) .set_attr("FCompute", MP_SGDMomUpdate); NNVM_REGISTER_OP(adam_update) -.set_attr("FCompute", AdamUpdate); +.set_attr("FCompute", AdamUpdate) +.set_attr("FComputeEx", AdamUpdateEx); NNVM_REGISTER_OP(rmsprop_update) .set_attr("FCompute", RMSPropUpdate); diff --git a/src/operator/random/sample_op.cc b/src/operator/random/sample_op.cc index 8d87d2b99d14..363163cbc697 100644 --- a/src/operator/random/sample_op.cc +++ b/src/operator/random/sample_op.cc @@ -61,7 +61,8 @@ Example:: [ 0.54488319, 0.84725171]] )code" ADD_FILELINE) -.set_attr("FCompute", SampleUniform_); +.set_attr("FCompute", SampleUniform_) +.set_attr("FComputeEx", SampleUniformEx_); // Add "normal" alias for backward compatibility MXNET_OPERATOR_REGISTER_SAMPLE(random_normal, SampleNormalParam) @@ -78,7 +79,8 @@ Example:: random_normal(loc=0, scale=1, shape=(2,2)) = [[ 1.89171135, -1.16881478], [-1.23474145, 1.55807114]] )code" ADD_FILELINE) -.set_attr("FCompute", SampleNormal_); +.set_attr("FCompute", SampleNormal_) +.set_attr("FComputeEx", SampleNormalEx_); MXNET_OPERATOR_REGISTER_SAMPLE(random_gamma, SampleGammaParam) .add_alias("_sample_gamma") @@ -91,7 +93,8 @@ Example:: random_gamma(alpha=9, beta=0.5, shape=(2,2)) = [[ 7.10486984, 3.37695289], [ 3.91697288, 3.65933681]] )code" ADD_FILELINE) -.set_attr("FCompute", SampleGamma_); +.set_attr("FCompute", SampleGamma_) +.set_attr("FComputeEx", SampleGammaEx_); MXNET_OPERATOR_REGISTER_SAMPLE(random_exponential, SampleExponentialParam) .add_alias("_sample_exponential") diff --git a/src/operator/random/sample_op.cu b/src/operator/random/sample_op.cu index 0d4b2e5a8270..7bdb9faf334e 100644 --- a/src/operator/random/sample_op.cu +++ b/src/operator/random/sample_op.cu @@ -28,21 +28,20 @@ namespace op { // GPU versions of uniform and normal distribution. 
template<> -void SampleUniform_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void SampleUniformDnsImpl(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const OpReqType& req, + TBlob* output) { using namespace mxnet::op; using namespace mshadow::expr; typedef gpu xpu; mshadow::Stream *s = ctx.get_stream(); const SampleUniformParam& param = nnvm::get(attrs.parsed); mshadow::Random *prnd = ctx.requested[0].get_random(s); - if (outputs[0].type_flag_ != mshadow::kFloat32) { - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + if (output->type_flag_ != mshadow::kFloat32) { + MSHADOW_REAL_TYPE_SWITCH(output->type_flag_, DType, { // Not float32: use workspace and copy to output - mshadow::Tensor out = outputs[0].FlatTo2D(s); + mshadow::Tensor out = output->FlatTo2D(s); mshadow::Tensor workspace = ctx.requested[1].get_space_typed (mshadow::Shape1(out.shape_.Size()), s); @@ -51,27 +50,36 @@ void SampleUniform_(const nnvm::NodeAttrs& attrs, }); } else { // float32: write directly into output - mshadow::Tensor out = outputs[0].FlatTo2D(s); + mshadow::Tensor out = output->FlatTo2D(s); prnd->SampleUniform(&out, param.low, param.high); } } template<> -void SampleNormal_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void SampleUniform_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + TBlob out = outputs[0]; + SampleUniformDnsImpl(attrs, ctx, req[0], &out); +} + +template<> +void SampleNormalDnsImpl(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const OpReqType& req, + TBlob* output) { using namespace mxnet::op; using namespace mshadow::expr; typedef gpu xpu; mshadow::Stream *s = ctx.get_stream(); const SampleNormalParam& param = nnvm::get(attrs.parsed); mshadow::Random *prnd 
= ctx.requested[0].get_random(s); - if (outputs[0].type_flag_ != mshadow::kFloat32) { - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + if (output->type_flag_ != mshadow::kFloat32) { + MSHADOW_REAL_TYPE_SWITCH(output->type_flag_, DType, { // Not float32: use workspace and copy to output - mshadow::Tensor out = outputs[0].FlatTo2D(s); + mshadow::Tensor out = output->FlatTo2D(s); mshadow::Tensor workspace = ctx.requested[1].get_space_typed (mshadow::Shape1(out.shape_.Size()), s); @@ -80,16 +88,28 @@ void SampleNormal_(const nnvm::NodeAttrs& attrs, }); } else { // float32: write directly into output - mshadow::Tensor out = outputs[0].FlatTo2D(s); + mshadow::Tensor out = output->FlatTo2D(s); prnd->SampleGaussian(&out, param.loc, param.scale); } } +template<> +void SampleNormal_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + TBlob out = outputs[0]; + SampleNormalDnsImpl(attrs, ctx, req[0], &out); +} + NNVM_REGISTER_OP(random_uniform) -.set_attr("FCompute", SampleUniform_); +.set_attr("FCompute", SampleUniform_) +.set_attr("FComputeEx", SampleUniformEx_); NNVM_REGISTER_OP(random_normal) -.set_attr("FCompute", SampleNormal_); +.set_attr("FCompute", SampleNormal_) +.set_attr("FComputeEx", SampleNormalEx_); } // namespace op } // namespace mxnet diff --git a/src/operator/random/sample_op.h b/src/operator/random/sample_op.h index a1a6a2345b1b..0cd3f6bc2efb 100644 --- a/src/operator/random/sample_op.h +++ b/src/operator/random/sample_op.h @@ -232,29 +232,75 @@ struct SampleGenNegBinomialParam : public dmlc::Parameter; + template -void SampleUniform_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void SampleComputeEx_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs, + FSampleCompute fcomp) 
{ + NDArray output = outputs[0]; + mshadow::Stream *s = ctx.get_stream(); + if (output.storage_type() == kRowSparseStorage) { + // indices + nnvm::dim_t nnr = output.shape()[0]; + output.CheckAndAlloc({mshadow::Shape1(nnr)}); + PopulateFullIdxRspImpl(s, &output); + // data + TBlob out_blob = output.data(); + fcomp(attrs, ctx, req[0], &out_blob); + } else { + LOG(FATAL) << "Unexpected storage type for SampleComputeEx_: " + << output.storage_type(); + } +} + +template +void SampleUniformDnsImpl(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const OpReqType& req, + TBlob* output) { using namespace mxnet::op; using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); const SampleUniformParam& param = nnvm::get(attrs.parsed); - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MSHADOW_REAL_TYPE_SWITCH(output->type_flag_, DType, { mshadow::Random *prnd = ctx.requested[0].get_random(s); - mshadow::Tensor out = outputs[0].FlatTo2D(s); + mshadow::Tensor out = output->FlatTo2D(s); prnd->SampleUniform(&out, param.low, param.high); }); } template -void SampleNormal_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +void SampleUniform_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + TBlob out = outputs[0]; + SampleUniformDnsImpl(attrs, ctx, req[0], &out); +} + + +template +void SampleUniformEx_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + SampleComputeEx_(attrs, ctx, inputs, req, outputs, SampleUniformDnsImpl); +} + +template +void SampleNormalDnsImpl(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const OpReqType& req, + TBlob* outputs) { using namespace mxnet::op; using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); @@ -268,11 +314,29 @@ void 
SampleNormal_(const nnvm::NodeAttrs& attrs, } template -void SampleGamma_(const nnvm::NodeAttrs& attrs, +void SampleNormal_(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { + TBlob out = outputs[0]; + SampleNormalDnsImpl(attrs, ctx, req[0], &out); +} + +template +void SampleNormalEx_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + SampleComputeEx_(attrs, ctx, inputs, req, outputs, SampleNormalDnsImpl); +} + +template +void SampleGammaDnsImpl(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const OpReqType& req, + TBlob* outputs) { using namespace mxnet::op; using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); @@ -286,6 +350,25 @@ void SampleGamma_(const nnvm::NodeAttrs& attrs, }); } +template +void SampleGamma_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + TBlob out = outputs[0]; + SampleGammaDnsImpl(attrs, ctx, req[0], &out); +} + +template +void SampleGammaEx_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + SampleComputeEx_(attrs, ctx, inputs, req, outputs, SampleGammaDnsImpl); +} + template void SampleExponential_(const nnvm::NodeAttrs& attrs, const OpContext& ctx, diff --git a/src/operator/tensor/cast_storage-inl.cuh b/src/operator/tensor/cast_storage-inl.cuh new file mode 100644 index 000000000000..afef53e979ea --- /dev/null +++ b/src/operator/tensor/cast_storage-inl.cuh @@ -0,0 +1,589 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file cast_storage-inl.cuh + * \brief implementation of cast_storage op on GPU + */ +#ifndef MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_CUH_ +#define MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_CUH_ + +#include +#include +#include +#include +#include "./util/tensor_util-inl.cuh" + +namespace mxnet { +namespace op { + +/*! + * \brief GPU Kernel for filling the value array of the rsp tensor. + * Parallelized by rsp tensor elements: 1 thread/element + */ +struct CastDnsRspValsKernel { + /*! 
+ * \brief + * \param tid global thread id + * \param rsp_val value array of rsp tensor to store data + * \param row_idx indices of non-zero rows + * \param dns dense matrix data + * \param nnr number of non-zero rows + * \param row_length number of elements per row + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* rsp_val, + const RType* row_idx, + const DType* dns, + const nnvm::dim_t nnr, + const nnvm::dim_t row_length) { + using nnvm::dim_t; + if (tid < nnr*row_length) { + const dim_t row_id = tid / row_length; + const dim_t row_el = tid % row_length; + const dim_t dns_idx = row_idx[row_id] * row_length + row_el; + rsp_val[tid] = dns[dns_idx]; + } + } +}; + +template +inline mshadow::Tensor AllocateTempDataForCast(const OpContext& op_ctx, + const mshadow::Shape& shape) { + Resource rsc = ResourceManager::Get()->Request(op_ctx.run_ctx.ctx, + ResourceRequest(ResourceRequest::kTempSpace)); + mshadow::Stream *stream = op_ctx.run_ctx.get_stream(); + return rsc.get_space_typed(shape, stream); +}; + +/*! + * \brief GPU implementation of casting a dns tensor to rsp type. 
+ */ +inline void CastStorageDnsRspImpl(const OpContext& ctx, + const gpu& gpu_dev, + const TBlob& dns, + NDArray* rsp) { + CHECK(rsp != nullptr); + CHECK_EQ(rsp->storage_type(), kRowSparseStorage); + CHECK_EQ(dns.shape_, rsp->shape()); + using mshadow::Shape1; + using mxnet_op::Kernel; + using nnvm::dim_t; + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(rsp->aux_type(rowsparse::kIdx), RType, { // row idx type + const dim_t num_rows = dns.shape_[0]; + const dim_t row_length = dns.shape_.ProdShape(1, dns.shape_.ndim()); + const dim_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; + const dim_t threads_per_block = mshadow::cuda::kBaseThreadNum; + const dim_t min_num_warps = 512; + dim_t num_threads; + // TODO: remove kernel dependency on warpSize=32 + if (threads_per_warp != 32) { + LOG(FATAL) << "CastStorageDnsRspImpl GPU kernels expect warpSize=32"; + } + // Determine temporary device storage requirements + dim_t* row_flg = NULL; + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + row_flg, + row_flg, + num_rows, + mshadow::Stream::GetStream(s)); + + // Allocate temp storage for marking non-zero rows and for cub's prefix sum + auto workspace = AllocateTempDataForCast(ctx, Shape1(num_rows*sizeof(dim_t) + + temp_storage_bytes)); + row_flg = reinterpret_cast(workspace.dptr_); + d_temp_storage = workspace.dptr_ + num_rows*sizeof(dim_t); + + // Mark non-zero rows as 'one' in row_flg + // Different kernel versions are optimized for different matrix instances + // (1) 'Thread kernel' (one thread computing one row) + // (2) 'Warp kernel' (one warp computing one row) + // (3) 'Block kernel' (one thread block computing one row) + const int kernel_version = 0; + switch (kernel_version) { + case 1: + num_threads = num_rows; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + 
break; + case 2: + num_threads = num_rows * threads_per_warp; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + break; + case 3: + num_threads = num_rows * threads_per_block; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + break; + default: + if (row_length < threads_per_warp) { + num_threads = num_rows; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + } else if (row_length < threads_per_block || num_rows > min_num_warps) { + num_threads = num_rows * threads_per_warp; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + } else { + num_threads = num_rows * threads_per_block; + Kernel::Launch(s, num_threads, + row_flg, dns.dptr(), num_rows, row_length); + } + break; + } + // Compute non-zero row indices through inclusive prefix sum + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + row_flg, + row_flg, + num_rows, + mshadow::Stream::GetStream(s)); + + // Get total number of non-zero rows from device + dim_t nnr = 0; + CUDA_CALL(cudaMemcpy(&nnr, &row_flg[num_rows-1], sizeof(dim_t), cudaMemcpyDeviceToHost)); + + // Allocate rsp tensor row index array and fill + rsp->CheckAndAllocAuxData(rowsparse::kIdx, Shape1(nnr)); + if (0 == nnr) return; + RType* row_idx = rsp->aux_data(rowsparse::kIdx).dptr(); + num_threads = num_rows; + Kernel::Launch(s, num_threads, + row_idx, row_flg, num_rows); + + // Construct shape of rsp tensor data, allocate, and fill + auto storage_shape = dns.shape_; + storage_shape[0] = nnr; + rsp->CheckAndAllocData(storage_shape); + num_threads = nnr * row_length; + Kernel::Launch(s, num_threads, + rsp->data().dptr(), row_idx, dns.dptr(), nnr, row_length); + }); + }); +} + +/*! + * \brief Thread kernel for initializing the indptr in a csr matrix. + * Parallelized by matrix rows: 1 thread/row + */ +struct CastDnsCsrIndPtrThreadKernel { + /*! 
+ * \brief + * \param tid global thread id + * \param indptr index pointer array of the csr matrix + * \param dns dense matrix + * \param num_rows number of rows of the dense matrix + * \param num_cols number of columns of the dense matrix + */ + template + __device__ __forceinline__ static void Map(int tid, + IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + if (tid == 0) { + indptr[tid] = 0; + } + if (tid < num_rows) { + dim_t nnz = 0; + const dim_t offset = tid * num_cols; + for (dim_t j = 0; j < num_cols; ++j) { + if (dns[offset+j] != 0) { + nnz++; + } + } + indptr[tid+1] = nnz; + } + } +}; + +/*! + * \brief Thread kernel for initializing the col_idx and value array of the csr matrix. + * Parallelized by matrix rows: 1 thread/row + */ +struct CastDnsCsrColIdxAndValsThreadKernel { + /*! + * \brief + * \param tid global thread id + * \param val data array of the csr matrix + * \param col_idx column index array of the csr matrix + * \param indptr index pointer array of the csr matrix + * \param dns dense matrix + * \param num_rows number of rows of the dense matrix + * \param num_cols number of columns of the dense matrix + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* val, + CType* col_idx, + const IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + if (tid < num_rows) { + const dim_t offset = tid * num_cols; + dim_t k = indptr[tid]; + for (dim_t j = 0; j < num_cols; ++j) { + if (dns[offset+j] != 0) { + val[k] = dns[offset+j]; + col_idx[k] = j; + ++k; + } + } + } + } +}; + +/*! + * \brief Warp kernel for initializing the indptr in a csr matrix. 
+ * Parallelized by matrix rows: 1 warp/row + */ +struct CastDnsCsrIndPtrWarpKernel { + template + __device__ __forceinline__ static void Map(int tid, + IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + typedef cub::WarpReduce WarpReduce; + const dim_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32; + __shared__ typename WarpReduce::TempStorage temp_storage[warps_per_block]; + + if (tid == 0) { + indptr[tid] = 0; + } + const dim_t warp_id = tid / 32; // global warp id + const dim_t warp_lane = threadIdx.x / 32; // local warp id within thread block + const dim_t lane = tid & (32-1); // local thread id within warp + if (warp_id < num_rows) { + dim_t lane_nnz = 0; + const dim_t offset = warp_id * num_cols; + for (dim_t j = lane; j < num_cols; j+=32) { + if (dns[offset+j] != 0) { + lane_nnz++; + } + } + dim_t aggr = WarpReduce(temp_storage[warp_lane]).Sum(lane_nnz); + if (lane == 0) { + indptr[warp_id+1] = aggr; + } + } + } +}; + +/*! + * \brief Warp kernel for initializing the col_idx and value array of the csr matrix. 
+ * Parallelized by matrix rows: 1 warp/row + */ +struct CastDnsCsrColIdxAndValsWarpKernel { + template + __device__ __forceinline__ static void Map(int tid, + DType* val, + CType* col_idx, + const IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + typedef cub::WarpScan WarpScan; + const dim_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32; + __shared__ typename WarpScan::TempStorage temp_storage[warps_per_block]; + __shared__ volatile dim_t warp_nnz[warps_per_block]; + + const dim_t warp_id = tid / 32; // global warp id + const dim_t warp_lane = threadIdx.x / 32; // local warp id within thread block + const dim_t lane = tid & (32-1); // local thread id within warp + if (warp_id < num_rows) { + const dim_t offset = warp_id * num_cols; + dim_t k = indptr[warp_id]; + dim_t nnz; + for (dim_t j = lane; j < num_cols+lane; j+=32) { + nnz = 0; + if (j < num_cols) { + if (dns[offset+j] != 0) { + nnz++; + } + } + if (lane == 31) { + warp_nnz[warp_lane] = nnz; + } + // Compute index each thread has to write to + WarpScan(temp_storage[warp_lane]).ExclusiveSum(nnz, nnz); + if (j < num_cols) { + if (dns[offset+j] != 0) { + val[k+nnz] = dns[offset+j]; + col_idx[k+nnz] = j; + } + } + if (lane == 31) { + warp_nnz[warp_lane] += nnz; + } + __syncwarp(); + k += warp_nnz[warp_lane]; + } + } + } +}; + +/*! + * \brief Block kernel for initializing the indptr in a csr matrix. 
+ * Parallelized by matrix rows: 1 threadBlock/row + */ +struct CastDnsCsrIndPtrBlockKernel { + template + __device__ __forceinline__ static void Map(int tid, + IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using mshadow::cuda::kBaseThreadNum; + using nnvm::dim_t; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + if (tid == 0) { + indptr[tid] = 0; + } + if (blockIdx.x < num_rows) { + dim_t lane_nnz = 0; + const dim_t offset = blockIdx.x * num_cols; + for (dim_t j = threadIdx.x; j < num_cols; j+=kBaseThreadNum) { + if (dns[offset+j] != 0) { + lane_nnz++; + } + } + dim_t aggr = BlockReduce(temp_storage).Sum(lane_nnz); + if (threadIdx.x == 0) { + indptr[blockIdx.x+1] = aggr; + } + } + } +}; + +/*! + * \brief Block kernel for initializing the col_idx and value array of the csr matrix. + * Parallelized by matrix rows: 1 threadBlock/row + */ +struct CastDnsCsrColIdxAndValsBlockKernel { + template + __device__ __forceinline__ static void Map(int tid, + DType* val, + CType* col_idx, + const IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using mshadow::cuda::kBaseThreadNum; + using nnvm::dim_t; + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + __shared__ volatile dim_t block_nnz; + + if (blockIdx.x < num_rows) { + const dim_t offset = blockIdx.x * num_cols; + dim_t k = indptr[blockIdx.x]; + dim_t nnz; + for (dim_t j = threadIdx.x; j < num_cols+threadIdx.x; j+=kBaseThreadNum) { + nnz = 0; + if (j < num_cols) { + if (dns[offset+j] != 0) { + nnz++; + } + } + if (threadIdx.x == kBaseThreadNum-1) { + block_nnz = nnz; + } + // Compute index each thread has to write to + BlockScan(temp_storage).ExclusiveSum(nnz, nnz); + if (j < num_cols) { + if (dns[offset+j] != 0) { + val[k+nnz] = dns[offset+j]; + col_idx[k+nnz] = j; + } + } + if (threadIdx.x == kBaseThreadNum-1) { + block_nnz 
+= nnz; + } + __syncthreads(); + k += block_nnz; + } + } + } +}; + +/*! + * \brief GPU implementation of casting a dense matrix to csr type. + */ +inline void CastStorageDnsCsrImpl(const OpContext& ctx, + const gpu& gpu_dev, + const TBlob& dns, + NDArray* csr) { + CHECK(csr != nullptr); + CHECK_EQ(csr->storage_type(), kCSRStorage); + CHECK_EQ(dns.shape_.ndim(), 2); + CHECK_EQ(dns.shape_, csr->shape()); + using mshadow::Shape1; + using mxnet_op::Kernel; + using nnvm::dim_t; + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIndPtr), IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIdx), CType, { // col_idx type + const dim_t num_rows = dns.shape_[0]; + const dim_t num_cols = dns.shape_[1]; + const dim_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; + const dim_t threads_per_block = mshadow::cuda::kBaseThreadNum; + const dim_t min_num_warps = 512; + dim_t num_threads; + // TODO: remove kernel dependency on warpSize=32 + if (threads_per_warp != 32) { + LOG(FATAL) << "CastStorageDnsCsrImpl GPU kernels expect warpSize=32"; + } + csr->CheckAndAllocAuxData(csr::kIndPtr, Shape1(num_rows+1)); + IType* indptr = csr->aux_data(csr::kIndPtr).dptr(); + DType* dns_data = dns.dptr(); + + // Different kernel versions are optimized for different matrix instances + // (1) 'Thread kernel' (one thread computing one row) + // (2) 'Warp kernel' (one warp computing one row) + // (3) 'Block kernel' (one thread block computing one row) + const int kernel_version = 0; + switch (kernel_version) { + case 1: + num_threads = num_rows; + Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); + break; + case 2: + num_threads = num_rows * threads_per_warp; + Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); + break; + case 3: + num_threads = num_rows * threads_per_block; + Kernel::Launch(s, num_threads, + indptr, dns_data, 
num_rows, num_cols); + break; + default: + if (num_cols < threads_per_warp) { + num_threads = num_rows; + Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); + } else if (num_cols < threads_per_block || num_rows > min_num_warps) { + num_threads = num_rows * threads_per_warp; + Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); + } else { + num_threads = num_rows * threads_per_block; + Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); + } + break; + } + + // Determine temporary device storage requirements + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + indptr, + indptr, + num_rows+1, + mshadow::Stream::GetStream(s)); + + // Allocate temporary storage + auto workspace = AllocateTempDataForCast(ctx, Shape1(temp_storage_bytes)); + + d_temp_storage = workspace.dptr_; + + // Compute indptr through inclusive prefix sum + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + indptr, + indptr, + num_rows+1, + mshadow::Stream::GetStream(s)); + + // Receive total number of nnz values from device + IType nnz = 0; + CUDA_CALL(cudaMemcpy(&nnz, &(indptr[num_rows]), sizeof(IType), cudaMemcpyDeviceToHost)); + + // Allocate column index array and data array of the csr matrix + csr->CheckAndAllocAuxData(csr::kIdx, Shape1(static_cast(nnz))); + csr->CheckAndAllocData(Shape1(static_cast(nnz))); + + // Compute and fill column index array and data array of the csr matrix + switch (kernel_version) { + case 1: + num_threads = num_rows; + Kernel::Launch(s, num_threads, + csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), + indptr, dns_data, num_rows, num_cols); + break; + case 2: + num_threads = num_rows * threads_per_warp; + Kernel::Launch(s, num_threads, + csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), + indptr, dns_data, num_rows, num_cols); + break; + case 3: + num_threads = num_rows * threads_per_block; + 
Kernel::Launch(s, num_threads, + csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), + indptr, dns_data, num_rows, num_cols); + break; + default: + if (num_cols < threads_per_warp) { + num_threads = num_rows; + Kernel::Launch(s, num_threads, + csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), + indptr, dns_data, num_rows, num_cols); + } else if (num_cols < threads_per_block || num_rows > min_num_warps) { + num_threads = num_rows * threads_per_warp; + Kernel::Launch(s, num_threads, + csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), + indptr, dns_data, num_rows, num_cols); + } else { + num_threads = num_rows * threads_per_block; + Kernel::Launch(s, num_threads, + csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), + indptr, dns_data, num_rows, num_cols); + } + break; + } + }); + }); + }); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_CUH_ diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h new file mode 100644 index 000000000000..acb30a9eff2b --- /dev/null +++ b/src/operator/tensor/cast_storage-inl.h @@ -0,0 +1,392 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file cast_storage-inl.h + * \brief cast_storage implementation for dense and sparse tensors + */ +#ifndef MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_H_ +#define MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_H_ + +#include +#include +#include +#include "../mxnet_op.h" +#include "../operator_common.h" +#ifdef __CUDACC__ +#include "./cast_storage-inl.cuh" +#endif // __CUDACC__ + + +namespace mxnet { +namespace op { + +/*! + * \brief CPU Kernel for marking row_idx of a RSP tensor per row. + */ +struct MarkRspRowIdx { + // i represents the row index of the tensor data + template + MSHADOW_CINLINE static void Map(int i, + RType* row_idx, + const DType* data, + const nnvm::dim_t row_length) { + using nnvm::dim_t; + dim_t j = 0; + dim_t offset = i * row_length; + for (; j < row_length; ++j) { + if (data[offset+j] != 0) { + break; + } + } + if (row_length == j) { + row_idx[i] = 0; // mark as zero for zero row + } else { + row_idx[i] = 1; // mark as one for non-zero row + } + } +}; + +/*! + * \brief CPU implementation of casting a dns tensor to rsp type. 
+ */ +inline void CastStorageDnsRspImpl(const OpContext& ctx, + const cpu& cpu_dev, + const TBlob& dns, + NDArray* rsp) { + using namespace rowsparse; + using namespace mshadow; + using nnvm::dim_t; + CHECK(rsp != nullptr); + CHECK_EQ(rsp->storage_type(), kRowSparseStorage); + CHECK_EQ(dns.shape_, rsp->shape()); + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(rsp->aux_type(kIdx), RType, { // row idx type + const dim_t num_rows = dns.shape_[0]; + const dim_t row_length = dns.shape_.ProdShape(1, dns.shape_.ndim()); + rsp->CheckAndAllocAuxData(kIdx, Shape1(num_rows)); + TBlob row_idx_blob = rsp->aux_data(kIdx); + RType* row_idx = row_idx_blob.dptr(); + dim_t num_threads = num_rows; + mxnet_op::Kernel::Launch(s, num_threads, + row_idx, dns.dptr(), row_length); + dim_t nnr = 0; + nnr = common::ParallelAccumulate(row_idx, num_rows, nnr); + rsp->set_aux_shape(kIdx, Shape1(nnr)); + if (0 == nnr) return; + auto storage_shape = dns.shape_; + storage_shape[0] = nnr; + rsp->CheckAndAllocData(storage_shape); + auto dns_data = dns.get_with_shape(Shape2(num_rows, row_length), s); + auto rsp_data = rsp->data().get_with_shape(Shape2(nnr, row_length), s); + dim_t idx = 0; + for (dim_t i = 0; i < num_rows; ++i) { + if (row_idx[i] > 0) { + row_idx[idx] = i; + Copy(rsp_data[idx], dns_data[i], s); + ++idx; + } + } + }); + }); +} + +// TODO(haibin) Use memcopy instead will be much faster than assigning each individual element +struct CastStorageRspDnsKernel { + template + MSHADOW_XINLINE static void Map(int i, + const nnvm::dim_t row_length, + const IType* idx, + const DType *data, + DType* dns) { + using nnvm::dim_t; + IType rid = idx[i]; + dim_t dns_offset = rid * row_length; + dim_t rsp_offset = i * row_length; + for (dim_t col = 0; col < row_length; col++) { + dns[dns_offset + col] = data[rsp_offset + col]; + } + } +}; + +/*! 
+ * \brief This function assumes that the memory for dns has been allocated already + * since the shape is known at binding stage. + */ +template +void CastStorageRspDnsImpl(const OpContext& ctx, + const NDArray& rsp, + TBlob* dns) { + mshadow::Stream* s = ctx.get_stream(); + CHECK_EQ(rsp.storage_type(), kRowSparseStorage); + using nnvm::dim_t; + MSHADOW_TYPE_SWITCH(dns->type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(rsp.aux_type(rowsparse::kIdx), IType, { + // assign zeros + mxnet_op::Kernel::Launch(s, dns->Size(), dns->dptr()); + if (rsp.storage_initialized()) { + // copy over row by row + auto in_idx = rsp.aux_data(rowsparse::kIdx).FlatTo1D(s).dptr_; + auto in_data = rsp.data().dptr(); + auto out_data = dns->dptr(); + auto shape = rsp.shape(); + const dim_t num_rows = rsp.aux_shape(rowsparse::kIdx).Size(); + const dim_t row_length = shape.ProdShape(1, shape.ndim()); + const dim_t num_threads = num_rows; + mxnet_op::Kernel::Launch(s, num_threads, + row_length, in_idx, in_data, out_data); + } + }); + }); +} + +/*! + * \brief CPU kernel for initializing the indptr in a csr matrix. + */ +struct FillCsrIndPtr { + /*! + * \brief + * \param i the i-th row of the dns tensor + * \param indptr the indptr of the csr tensor + * \param dns the dns tensor + * \param num_rows number of rows of the dns tensor + * \param num_cols number of columns of the dns tensor + */ + template + MSHADOW_CINLINE static void Map(int i, + IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + indptr[i+1] = 0; + const dim_t offset = i * num_cols; + for (dim_t j = 0; j < num_cols; ++j) { + if (dns[offset+j] != 0) { + ++indptr[i+1]; + } + } + } +}; + +/*! + * \brief CPU kernel for initializing the col_idx and value array of the csr matrix. + */ +struct FillCsrColIdxAndVals { + /*! 
+ * \brief + * \param i the i-th row of the dns tensor + * \param val value array of the csr tensor + * \param col_idx column idx array of the csr tensor + * \param indptr indptr array of the csr tensor + * \param dns dns tensor + * \param num_rows number of rows of the dns tensor + * \param num_cols number of columns of the dns tensor + */ + template + MSHADOW_CINLINE static void Map(int i, + DType* val, + CType* col_idx, + const IType* indptr, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + const dim_t offset = i * num_cols; + IType k = indptr[i]; + for (dim_t j = 0; j < num_cols; ++j) { + if (dns[offset+j] != 0) { + val[k] = dns[offset+j]; + col_idx[k] = j; + ++k; + } + } + } +}; + +/*! + * \brief CPU implementation of casting a dns matrix to csr type. + */ +inline void CastStorageDnsCsrImpl(const OpContext& ctx, + const cpu& cpu_dev, + const TBlob& dns, + NDArray* csr) { + CHECK(csr != nullptr); + CHECK_EQ(csr->storage_type(), kCSRStorage); + CHECK_EQ(dns.shape_.ndim(), 2); + CHECK_EQ(dns.shape_, csr->shape()); + using mshadow::Shape1; + using nnvm::dim_t; + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIndPtr), IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIdx), CType, { // col idx type + const dim_t num_rows = dns.shape_[0]; + const dim_t num_cols = dns.shape_[1]; + csr->CheckAndAllocAuxData(csr::kIndPtr, mshadow::Shape1(num_rows+1)); + IType* indptr = csr->aux_data(csr::kIndPtr).dptr(); + DType* dns_data = dns.dptr(); + dim_t num_threads = num_rows; + mxnet_op::Kernel::Launch(s, num_threads, + indptr, dns_data, num_rows, num_cols); + // single thread to accumulate indptr + // indptr[num_rows] indicates the number of non-zero elements + indptr[0] = 0; + for (dim_t i = 0; i < num_rows; ++i) { + indptr[i+1] += indptr[i]; + } + // allocate column idx array and value array + 
csr->CheckAndAllocAuxData(csr::kIdx, Shape1(static_cast(indptr[num_rows]))); + csr->CheckAndAllocData(Shape1(static_cast(indptr[num_rows]))); + // fill col_idx and value arrays of the csr + mxnet_op::Kernel::Launch(s, num_threads, + csr->data().dptr(), csr->aux_data(csr::kIdx).dptr(), + indptr, dns_data, num_rows, num_cols); + }); + }); + }); +} + +/*! + * \brief This is the kernel for copying csr.data to its corresponding dns matrix. + */ +struct CopyCsrDataToDns { + /*! + * \brief + * \param i the i-th row of the dns tensor + * \param dns_data data blob of the dns tensor + * \param col_idx column idx array of the csr tensor + * \param indptr indptr array of the csr tensor + * \param csr_data data blob of the csr tensor + * \param num_cols number of columns of the dns tensor + */ + template + MSHADOW_XINLINE static void Map(int i, + DType* dns_data, + const CType* col_idx, + const IType* indptr, + const DType* csr_data, + const nnvm::dim_t num_cols) { + const nnvm::dim_t offset = i * num_cols; + for (IType j = indptr[i]; j < indptr[i+1]; ++j) { + dns_data[offset+col_idx[j]] = csr_data[j]; + } + } +}; + +/*! + * \brief Casts a csr matrix to dns format. 
+ */ +template +void CastStorageCsrDnsImpl(const OpContext& ctx, + const NDArray& csr, + TBlob* dns) { + CHECK(dns != nullptr); + CHECK_EQ(csr.storage_type(), kCSRStorage); + CHECK_EQ(dns->shape_.ndim(), 2); + CHECK_EQ(dns->shape_, csr.shape()); + using nnvm::dim_t; + mshadow::Stream* s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(dns->type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(csr.aux_type(csr::kIndPtr), IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(csr.aux_type(csr::kIdx), CType, { // col idx type + const dim_t num_rows = dns->shape_[0]; + const dim_t num_cols = dns->shape_[1]; + DType* dns_data = dns->dptr(); + dim_t num_threads = dns->shape_.Size(); + mxnet_op::Kernel::Launch(s, num_threads, dns_data); + if (!csr.storage_initialized()) return; + const IType* indptr = csr.aux_data(csr::kIndPtr).dptr(); + const CType* col_idx = csr.aux_data(csr::kIdx).dptr(); + const DType* csr_data = csr.data().dptr(); + num_threads = num_rows; + mxnet_op::Kernel::Launch(s, num_threads, + dns_data, col_idx, indptr, csr_data, num_cols); + }); + }); + }); +} + +/*! + * \brief Dispatches to the cast implementation matching the (source, destination) + * storage type pair of input and output; any other pair is a fatal error. + */ +template +void CastStorageComputeImpl(const OpContext& ctx, + const NDArray& input, + const NDArray& output) { + const auto src_stype = input.storage_type(); + const auto dst_stype = output.storage_type(); + if (src_stype == kRowSparseStorage && dst_stype == kDefaultStorage) { + TBlob ret = output.data(); + CastStorageRspDnsImpl(ctx, input, &ret); + } else if (src_stype == kDefaultStorage && dst_stype == kRowSparseStorage) { + NDArray ret = output; // get rid of the const qualifier + CastStorageDnsRspImpl(ctx, xpu(), input.data(), &ret); + } else if (src_stype == kDefaultStorage && dst_stype == kCSRStorage) { + NDArray ret = output; // get rid of the const qualifier + CastStorageDnsCsrImpl(ctx, xpu(), input.data(), &ret); + } else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) { + TBlob ret = output.data(); + CastStorageCsrDnsImpl(ctx, input, &ret); + } else { + LOG(FATAL) << "Not 
implemented"; + } +} + +/*! + * \brief Parameter of cast_storage: stype selects the output storage type. + */ +struct CastStorageParam : public dmlc::Parameter { + int stype; + DMLC_DECLARE_PARAMETER(CastStorageParam) { + DMLC_DECLARE_FIELD(stype) + .add_enum("default", kDefaultStorage) + .add_enum("row_sparse", kRowSparseStorage) + .add_enum("csr", kCSRStorage) + .describe("Output storage type."); + } +}; + +/*! + * \brief Storage type inference for cast_storage: requires a defined input storage + * type and a defined stype parameter, then assigns/checks out_attrs[0] against stype. + */ +inline bool CastStorageInferStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + CHECK_NE(in_attrs->at(0), kUndefinedStorage) + << "src ndarray's storage type must be specified"; + const CastStorageParam& param = nnvm::get(attrs.parsed); + CHECK_NE(param.stype, kUndefinedStorage) + << "dst ndarray's storage type must be specified"; + TYPE_ASSIGN_CHECK(*out_attrs, 0, param.stype); + return true; +} + +/*! + * \brief Checks that there is exactly one input and one output, returns early for + * a kNullOp request, requires kWriteTo otherwise, and forwards to CastStorageComputeImpl. + */ +template +void CastStorageComputeEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + // unsigned literals keep the size() comparisons free of signed/unsigned mixing + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + if (req[0] == kNullOp) return; + CHECK_EQ(req[0], kWriteTo) << "CastStorageComputeEx expects req[0] == kWriteTo"; + CastStorageComputeImpl(ctx, inputs[0], outputs[0]); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_H_ diff --git a/src/operator/tensor/cast_storage.cc b/src/operator/tensor/cast_storage.cc new file mode 100644 index 000000000000..b5de8d0f08bd --- /dev/null +++ b/src/operator/tensor/cast_storage.cc @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file cast_storage.cc + * \brief CPU Implementation of cast_storage operator. + */ + +#include "./cast_storage-inl.h" +#include "../elemwise_op_common.h" +#include "../tensor/elemwise_unary_op.h" + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(CastStorageParam); +NNVM_REGISTER_OP(cast_storage) +.add_alias("_sparse_cast_storage") +.describe(R"code(Casts tensor storage type to the new type. + +When an NDArray with default storage type is cast to csr or row_sparse storage, +the result is compact, which means: + +- for csr, zero values will not be retained +- for row_sparse, row slices of all zeros will not be retained + +The storage type of ``cast_storage`` output depends on stype parameter: + +- cast_storage(csr, 'default') = default +- cast_storage(row_sparse, 'default') = default +- cast_storage(default, 'csr') = csr +- cast_storage(default, 'row_sparse') = row_sparse + +Example:: + + dense = [[ 0., 1., 0.], + [ 2., 0., 3.], + [ 0., 0., 0.], + [ 0., 0., 0.]] + + # cast to row_sparse storage type + rsp = cast_storage(dense, 'row_sparse') + rsp.indices = [0, 1] + rsp.values = [[ 0., 1., 0.], + [ 2., 0., 3.]] + + # cast to csr storage type + csr = cast_storage(dense, 'csr') + csr.indices = [1, 0, 2] + csr.values = [ 1., 2., 3.] 
+ csr.indptr = [0, 1, 3, 3, 3] + +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FInferStorageType", CastStorageInferStorageType) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", IdentityCompute) +.set_attr("FComputeEx", CastStorageComputeEx) +.set_attr("FGradient", ElemwiseGradUseNone{"_copy"}) +.add_argument("data", "NDArray-or-Symbol", "The input.") +.add_arguments(CastStorageParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/cast_storage.cu b/src/operator/tensor/cast_storage.cu new file mode 100644 index 000000000000..1be5f79ae297 --- /dev/null +++ b/src/operator/tensor/cast_storage.cu @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file cast_storage.cu + * \brief GPU Implementation of cast_storage operator. 
+ */ +#include "./cast_storage-inl.h" +#include "../tensor/elemwise_unary_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(cast_storage) +.set_attr("FCompute", IdentityCompute) +.set_attr("FComputeEx", CastStorageComputeEx); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/dot-inl.cuh b/src/operator/tensor/dot-inl.cuh new file mode 100644 index 000000000000..41c3faaf419f --- /dev/null +++ b/src/operator/tensor/dot-inl.cuh @@ -0,0 +1,883 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file dot-inl.cuh + * \brief implementation of matrix dot op on GPU + */ +#ifndef MXNET_OPERATOR_TENSOR_DOT_INL_CUH_ +#define MXNET_OPERATOR_TENSOR_DOT_INL_CUH_ + +#include +#include +#include "./util/tensor_util-inl.cuh" + +namespace mxnet { +namespace op { + +/*! + * \brief GPU scalar kernel of dot(csr, dns1) = dns2 + * Parallelization by output matrix elements: 1 thread/element + */ +template +struct DotCsrDnsDnsScalarKernel { + /*! + * \brief This function represents performing an inner product between a row of lhs + * and a column of rhs and then assigning the value to out[tid]. 
+ * \param tid global thread id + * \param out output matrix data + * \param data_l csr matrix data + * \param indptr_l csr matrix row index pointer + * \param col_idx_l csr matrix column indices + * \param data_r dns1 matrix data of rhs + * \param num_cols_r dns1 matrix number of columns + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t num_cols_r) { + const nnvm::dim_t irow = tid / num_cols_r; // row id of the lhs + const nnvm::dim_t icol = tid % num_cols_r; // col id of the rhs + DType sum = 0; + for (IType j = indptr_l[irow]; j < indptr_l[irow+1]; ++j) { + const CType cur_col = col_idx_l[j]; // corresponding row id of the rhs + sum += data_l[j] * data_r[cur_col*num_cols_r+icol]; + } + KERNEL_ASSIGN(out[tid], req, sum); + } +}; + +/*! + * \brief GPU vector kernel of dot(csr, dns1) = dns2 + * Parallelization by output matrix elements: 1 warp/element + */ +template +struct DotCsrDnsDnsVectorKernel { + /*! + * \brief see DotCsrDnsDnsScalarKernel Map for documentation. 
+ */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t num_cols_r) { + using nnvm::dim_t; + __shared__ volatile DType vals[mshadow::cuda::kBaseThreadNum]; + const dim_t warp_id = tid / 32; // global warp id + const dim_t lane = tid & (32-1); // local thread id within warp + const dim_t irow = warp_id / num_cols_r; // lhs row that this warp computes + const dim_t kcol = warp_id % num_cols_r; // rhs column that this warp computes + + // Range of nnz elements in this row + const dim_t low = static_cast(indptr_l[irow]); + const dim_t high = static_cast(indptr_l[irow+1]); + + // Compute running sum per thread + DType sum = 0; + for (dim_t j = low+lane; j < high; j+=32) { + sum += data_l[j] * data_r[col_idx_l[j]*num_cols_r + kcol]; + } + vals[threadIdx.x] = sum; __syncwarp(); + + // Parallel reduction in shared memory + if (lane < 16) {vals[threadIdx.x] += vals[threadIdx.x+16];} __syncwarp(); + if (lane < 8) {vals[threadIdx.x] += vals[threadIdx.x+ 8];} __syncwarp(); + if (lane < 4) {vals[threadIdx.x] += vals[threadIdx.x+ 4];} __syncwarp(); + if (lane < 2) {vals[threadIdx.x] += vals[threadIdx.x+ 2];} __syncwarp(); + if (lane < 1) {vals[threadIdx.x] += vals[threadIdx.x+ 1];} __syncwarp(); + + if (lane == 0) { + KERNEL_ASSIGN(out[irow*num_cols_r+kcol], req, vals[threadIdx.x]); + } + } +}; + +/*! + * \brief GPU scalar kernel of dot(csr.T, dns1) = dns2 + * Parallelization by output matrix elements: 1 thread/element + */ +template +struct DotCsrTransDnsDnsScalarKernel { + /*! + * \brief This function represents performing an inner product between a column of lhs + * and a column of rhs and then assigning the value to out[tid]. 
+ * \param tid global thread id + * \param out output matrix + * \param data_l csr matrix data + * \param indptr_l csr matrix row index pointer + * \param col_idx_l csr matrix column indices + * \param data_r dns1 matrix data of rhs + * \param num_rows_l csr matrix number of rows (= number of columns of csr.T) + * \param num_cols_r dns1 matrix number of columns + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t num_rows_l, + const nnvm::dim_t num_cols_r) { + using nnvm::dim_t; + const dim_t irow = tid / num_cols_r; // col id of the lhs + const dim_t icol = tid % num_cols_r; // col id of the rhs + DType sum = 0; + + // Each thread scans each column with binary search to find nnz elements in its row + for (dim_t k = 0; k < num_rows_l; ++k) { + const dim_t low = static_cast(indptr_l[k]); + const dim_t high = static_cast(indptr_l[k+1]); + if (low == high || irow < col_idx_l[low] || irow > col_idx_l[high-1]) continue; + dim_t j = high, l = low, r = high - 1; + while (l <= r) { + dim_t m = l + (r - l) / 2; + if (col_idx_l[m] == irow) { + j = m; break; + } + if (col_idx_l[m] < irow) { + l = m + 1; + } else { + r = m - 1; + } + } + if (j < high) { + sum += data_l[j] * data_r[k*num_cols_r+icol]; + } + } + KERNEL_ASSIGN(out[tid], req, sum); + } +}; + +/*! + * \brief GPU warp kernel of dot(csr.T, dns1) = dns2 + * Parallelization by columns: 1 warp computes one lhs column for one rhs column + */ +struct DotCsrTransDnsDnsWarpKernel { + /*! + * \brief see DotCsrTransDnsDnsScalarKernel Map for documentation. 
+ */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t num_cols_r) { + using nnvm::dim_t; + const dim_t warp_id = tid / 32; // global warp id + const dim_t lane = tid & (32-1); // local thread id within warp + const dim_t icol = warp_id / num_cols_r; // lhs column that this warp computes + const dim_t kcol = warp_id % num_cols_r; // rhs column that this warp computes + + // Compute range of nnz elements in this column + const dim_t low = static_cast(indptr_l[icol]); + const dim_t high = static_cast(indptr_l[icol+1]); + + // Iterate through the nnz elements in this column + for (dim_t j = low+lane; j < high; j+=32) { + const dim_t irow = static_cast(col_idx_l[j]); + const DType val = data_l[j]*data_r[icol*num_cols_r+kcol]; + atomicAdd(static_cast(&(out[irow*num_cols_r+kcol])), val); + } + } +}; + +/*! + * \brief GPU thread block kernel of dot(csr.T, dns1) = dns2 + * Parallelization by columns: 1 thread block computes one lhs column for all rhs columns + */ +struct DotCsrTransDnsDnsThreadBlockKernel { + /*! + * \brief see DotCsrTransDnsDnsScalarKernel Map for documentation. 
+ */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t num_cols_r) { + using nnvm::dim_t; + const dim_t warps_per_block = blockDim.x / 32; // number of warps in this thread block + const dim_t warp_id = tid / 32; // global warp id + const dim_t lane = tid & (32-1); // local thread id within warp + const dim_t icol = blockIdx.x; // lhs column that this thread block computes + const dim_t kcol = warp_id % warps_per_block; // rhs column where warp starts computing (offset) + + // Compute range of nnz elements in this lhs column + const dim_t low = static_cast(indptr_l[icol]); + const dim_t high = static_cast(indptr_l[icol+1]); + + // Iterate through the nnz elements in this lhs column + for (dim_t j = low+lane; j < high; j+=32) { + const dim_t irow = static_cast(col_idx_l[j]); + const DType datum_l = data_l[j]; + // Iterate over rhs columns that this warp computes + for (dim_t k = kcol; k < num_cols_r; k+=warps_per_block) { + const DType val = datum_l*data_r[icol*num_cols_r+k]; + atomicAdd(static_cast(&(out[irow*num_cols_r+k])), val); + } + } + } +}; + +/*! + * \brief GPU warp block kernel of dot(csr.T, dns1) = dns2 + * Parallelization by columns: 1 warp computes one lhs column for all rhs columns + */ +struct DotCsrTransDnsDnsWarpBlockKernel { + /*! + * \brief see DotCsrTransDnsDnsScalarKernel Map for documentation. 
+ */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t num_cols_r) { + using nnvm::dim_t; + const dim_t warp_id = tid / 32; // global warp id + const dim_t lane = tid & (32-1); // local thread id within warp + const dim_t icol = warp_id; // lhs column that this warp computes + + // Compute range of nnz elements in this column + const dim_t low = static_cast(indptr_l[icol]); + const dim_t high = static_cast(indptr_l[icol+1]); + + // Iterate through the nnz elements in lhs column + for (dim_t j = low+lane; j < high; j+=32) { + const dim_t irow = static_cast(col_idx_l[j]); + const DType datum_l = data_l[j]; + // Iterate over all rhs columns + for (dim_t k = 0; k < num_cols_r; k++) { + const DType val = datum_l*data_r[icol*num_cols_r+k]; + atomicAdd(static_cast(&(out[irow*num_cols_r+k])), val); + } + } + } +}; + +/*! + * \brief GPU warp kernel of dot(csr.T, dns) = rsp + * Parallelization by columns: 1 warp computes one lhs column for one rhs column + */ +struct DotCsrTransDnsRspWarpKernel { + /*! 
+ * \brief + * \param tid global thread id + * \param out output rsp matrix data + * \param row_flg_sum_out inclusive prefix sum array over 0/1 marked row flag array + * \param data_l csr matrix data + * \param indptr_l csr matrix row index pointer + * \param col_idx_l csr matrix column indices + * \param data_r dns matrix data + * \param num_cols_r dns matrix number of columns + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const nnvm::dim_t* row_flg_sum_out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t num_cols_r) { + using nnvm::dim_t; + const dim_t warp_id = tid / 32; // global warp id + const dim_t lane = tid & (32-1); // local thread id within warp + const dim_t icol = warp_id / num_cols_r; // lhs column that this warp computes + const dim_t kcol = warp_id % num_cols_r; // rhs column that this warp computes + + // Compute range of nnz elements in this column + const dim_t low = static_cast(indptr_l[icol]); + const dim_t high = static_cast(indptr_l[icol+1]); + + // Iterate through the nnz elements in this column + for (dim_t j = low+lane; j < high; j+=32) { + const dim_t irow = static_cast(col_idx_l[j]); + const dim_t rsp_row = row_flg_sum_out[irow]-1; + const DType val = data_l[j]*data_r[icol*num_cols_r+kcol]; + atomicAdd(static_cast(&(out[rsp_row*num_cols_r+kcol])), val); + } + } +}; + +/*! + * \brief GPU Kernel of dot(csr.T, rsp1) = rsp2 + * Parallelization by rows: 1 thread/row + * TODO: write a faster kernel optimized for GPU + */ +struct DotCsrTransRspRspByRowsKernel { + /*! 
+ * \brief + * \param tid global thread id + * \param out output rsp matrix data + * \param row_idx_out output rsp matrix non-zero row indices + * \param data_l csr matrix data + * \param indptr_l csr matrix row index pointer + * \param col_idx_l csr matrix column indices + * \param data_r rsp1 matrix data + * \param row_idx_r rsp1 matrix non-zero row indices + * \param num_cols_r rsp1 matrix number of cols + * \param nnr_r rsp1 matrix number of non-zero rows + * \param nnr_out output rsp matrix number of non-zero rows + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const RType* row_idx_out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const RType* row_idx_r, + const nnvm::dim_t num_cols_r, + const nnvm::dim_t nnr_r, + const nnvm::dim_t nnr_out) { + using nnvm::dim_t; + // This thread computes non-zero row 'tid' of the output matrix + // The actual row id corresponding to the lhs row is row_idx_out[tid] + if (tid < nnr_out) { + const dim_t offset_out = tid * num_cols_r; + // Iterate over rhs matrix rows (or, equivalently, lhs columns worthy taking a look at) + for (dim_t i = 0; i < nnr_r; i++) { + const RType j = row_idx_r[i]; // j is the actual rhs row id (= lhs column id) + if (indptr_l[j] == indptr_l[j+1]) continue; + const dim_t offset_r = i * num_cols_r; + // Iterate over lhs column j to find possible non-zero value in this row + // TODO: remove sequential search, this is a bottleneck + for (IType k = indptr_l[j]; k < indptr_l[j+1]; k++) { + const CType col_idx = col_idx_l[k]; + if (col_idx == row_idx_out[tid]) { + for (dim_t l = 0; l < num_cols_r; l++) { + out[offset_out+l] += data_l[k] * data_r[offset_r+l]; + } + } else if (col_idx > row_idx_out[tid]) { + break; + } + } + } + } + } +}; + +/*! + * \brief GPU Kernel of dot(csr, rsp) = dns + * Parallelization by output elements: 1 thread/element + */ +struct DotCsrRspDnsScalarKernel { + /*! 
+ * \brief + * \param tid global thread id + * \param out output dns matrix data + * \param data_l csr matrix data + * \param indptr_l csr matrix row index pointer + * \param col_idx_l csr matrix column indices + * \param data_r rsp matrix data + * \param row_idx_r rsp matrix non-zero row indices + * \param row_flg_r rsp matrix auxiliary array holding storage indices of non-zero rows + * \param nnr_r rsp matrix number of non-zero rows + * \param num_rows output dns matrix number of rows + * \param num_cols output dns matrix number of columns + */ + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const RType* row_idx_r, + const RType* row_flg_r, + const nnvm::dim_t nnr_r, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + if (tid < num_rows*num_cols) { + const dim_t i = static_cast(tid) / num_cols; // i = row this thread computes + const dim_t k = static_cast(tid) % num_cols; // k = col this thread computes + // Compute inner product of i-th row and k-th col + DType sum = 0; + for (IType j = indptr_l[i]; j < indptr_l[i+1]; j++) { + const dim_t csr_col = col_idx_l[j]; + const dim_t rsp_row_idx = row_flg_r[csr_col]; + if (rsp_row_idx > 0) { + sum += data_l[j] * data_r[(rsp_row_idx-1)*num_cols+k]; + } + } + if (sum != 0) { + out[i*num_cols+k] += sum; + } + } + } +}; + +/*! 
+ * \brief GPU Impl of dot(csr, dns1) = dns2 and dot(csr.T, dns1) = dns2 + */ +inline void DotCsrDnsDnsImpl(const OpContext& ctx, + const gpu& gpu_dev, + const NDArray& lhs, + const TBlob& rhs, + const OpReqType req, + const bool trans_lhs, + TBlob* ret) { + if (kNullOp == req) return; + CHECK_EQ(lhs.storage_type(), kCSRStorage); + if (!lhs.storage_initialized()) return; + + using mshadow::cuda::kBaseThreadNum; + using mxnet_op::Kernel; + using mxnet_op::set_zero; + using nnvm::dim_t; + mshadow::Stream* s = ctx.get_stream(); + + const dim_t num_rows_l = lhs.shape()[0]; + const dim_t num_cols_r = rhs.shape_[1]; + const dim_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; + const dim_t threads_per_block = kBaseThreadNum; + dim_t num_threads; + // TODO: remove kernel dependency on warpSize=32 + if (threads_per_warp != 32) { + LOG(FATAL) << "DotCsrDnsDnsImpl GPU kernels expect warpSize=32"; + } + + const TBlob data_l = lhs.data(); + const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); + const TBlob col_idx_l = lhs.aux_data(csr::kIdx); + const TBlob& data_r = rhs; + const TBlob data_out = *ret; + + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type + if (kWriteTo == req) { + num_threads = data_out.Size(); + Kernel::Launch(s, num_threads, data_out.dptr()); + } + if (trans_lhs) { + // Different kernel versions are optimized for different matrix instances + // TODO: switch between kernel versions depending on input + // (1) 'Scalar kernel' (one thread computing one output element ) + // (2) 'Warp kernel' (one warp computing one lhs column for one rhs column ) + // (3) 'Thread block kernel' (one thread block computing one lhs column for all rhs columns) + // (4) 'Warp block kernel' (one warp computing one lhs column for all rhs columns) + const int kernel_version = 0; + switch (kernel_version) { + case 
1: + num_threads = data_out.Size(); + MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { + Kernel, gpu>::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_rows_l, num_cols_r); + }); + break; + case 2: + num_threads = threads_per_warp * num_rows_l * num_cols_r; + Kernel::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_cols_r); + break; + case 3: + num_threads = threads_per_block * num_rows_l; + Kernel::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_cols_r); + break; + case 4: + num_threads = threads_per_warp * num_rows_l; + Kernel::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_cols_r); + break; + default: + num_threads = threads_per_warp * num_rows_l * num_cols_r; + Kernel::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_cols_r); + break; + } + } else { + // Different kernel versions are optimized for different matrix instances + // (1) 'Scalar kernel' (one thread computing one output element) + // (2) 'Vector kernel' (one warp computing one output element) + const int kernel_version = 0; + switch (kernel_version) { + case 1: + num_threads = data_out.Size(); + MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { + Kernel, gpu>::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_cols_r); + }); + break; + case 2: + num_threads = threads_per_warp * num_rows_l * num_cols_r; + MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { + Kernel, gpu>::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_cols_r); + }); + break; + default: + if (num_cols_r > 4) { + num_threads = data_out.Size(); + MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { + Kernel, gpu>::Launch(s, num_threads, + 
data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_cols_r); + }); + } else { + num_threads = threads_per_warp * num_rows_l * num_cols_r; + MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { + Kernel, gpu>::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), num_cols_r); + }); + } + break; + } + } + }); + }); + }); +} + +/*! + * \brief GPU Impl of dot(csr, dns) = rsp and dot(csr.T, dns) = rsp + */ +inline void DotCsrDnsRspImpl(const OpContext& ctx, + const gpu& gpu_dev, + const NDArray& lhs, + const TBlob& rhs, + const OpReqType req, + const bool trans_lhs, + NDArray* ret) { + if (kNullOp == req) return; + CHECK_EQ(lhs.storage_type(), kCSRStorage); + CHECK_EQ(ret->storage_type(), kRowSparseStorage); + CHECK_EQ(req, kWriteTo); + if (!lhs.storage_initialized()) return; + + using mshadow::Shape1; + using mxnet_op::Kernel; + using mxnet_op::set_zero; + using nnvm::dim_t; + mshadow::Stream* s = ctx.get_stream(); + + const TBlob data_l = lhs.data(); + const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); + const TBlob col_idx_l = lhs.aux_data(csr::kIdx); + const TBlob& data_r = rhs; + + const dim_t num_rows_l = lhs.shape()[0]; + const dim_t num_cols_l = lhs.shape()[1]; + const dim_t num_cols_r = rhs.shape_[1]; + const dim_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; + dim_t num_threads; + // TODO: remove kernel dependency on warpSize=32 + if (threads_per_warp != 32) { + LOG(FATAL) << "DotCsrDnsRspImpl GPU kernels expect warpSize=32"; + } + + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type + if (trans_lhs) { + // Compute number of non-zero rows (nnr) of output matrix + // - alloc temp storage for row_flg array and for cub's prefix sum + // - mark non-zero columns of csr matrix in row_flg + // - compute inclusive 
prefix sum over marked array + // - copy last value (nnr_out) from device to host + dim_t* row_flg_out = NULL; + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + row_flg_out, + row_flg_out, + num_cols_l, + mshadow::Stream::GetStream(s)); + mshadow::Tensor workspace = ctx.requested[0] + .get_space_typed(Shape1(num_cols_l * sizeof(dim_t) + + temp_storage_bytes), s); + row_flg_out = reinterpret_cast(workspace.dptr_); + d_temp_storage = workspace.dptr_ + num_cols_l*sizeof(dim_t); + num_threads = num_cols_l; + Kernel::Launch(s, num_threads, row_flg_out); + num_threads = num_rows_l * threads_per_warp; + Kernel::Launch(s, num_threads, + row_flg_out, col_idx_l.dptr(), indptr_l.dptr(), + num_rows_l, num_cols_l); + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + row_flg_out, + row_flg_out, + num_cols_l, + mshadow::Stream::GetStream(s)); + dim_t nnr_out = 0; + CUDA_CALL(cudaMemcpy(&nnr_out, &row_flg_out[num_cols_l-1], sizeof(dim_t), + cudaMemcpyDeviceToHost)); + + // Allocate output matrix space + ret->CheckAndAlloc({Shape1(nnr_out)}); + const TBlob data_out_blob = ret->data(); + const TBlob row_idx_out_blob = ret->aux_data(rowsparse::kIdx); + MSHADOW_IDX_TYPE_SWITCH(row_idx_out_blob.type_flag_, RType, { // row idx type + DType* data_out = data_out_blob.dptr(); + RType* row_idx_out = row_idx_out_blob.dptr(); + num_threads = nnr_out * num_cols_r; + Kernel::Launch(s, num_threads, data_out); + num_threads = nnr_out; + Kernel::Launch(s, num_threads, row_idx_out); + + // Fill row_idx array of output matrix, using the row_flg values + num_threads = num_cols_l; + Kernel::Launch(s, num_threads, + row_idx_out, row_flg_out, num_cols_l); + + // Perform matrix-matrix multiply + num_threads = threads_per_warp * num_rows_l * num_cols_r; + Kernel::Launch(s, num_threads, + data_out, row_flg_out, + data_l.dptr(), indptr_l.dptr(), col_idx_l.dptr(), + data_r.dptr(), num_cols_r); + }); + } 
else { + LOG(FATAL) << "DotCsrDnsRspImpl has not implemented dot(csr, dns) = rsp yet."; + } + }); + }); + }); +} + +/*! + * \brief GPU Impl of dot(csr, rsp1) = rsp2 and dot(csr.T, rsp1) = rsp2 + * TODO: Optimize for GPU; this is a baseline implementation providing + * the operator functionality, it is not yet fully optimized for GPU. + */ +inline void DotCsrRspRspImpl(const OpContext& ctx, + const gpu& gpu_dev, + const NDArray& lhs, + const NDArray& rhs, + const OpReqType req, + const bool trans_lhs, + NDArray* ret) { + if (kNullOp == req) return; + // Reuse dot(csr, dns) implementation if rhs rsp matrix is in fact dense + if (rhs.storage_shape()[0] == rhs.shape()[0]) { + DotCsrDnsRspImpl(ctx, gpu_dev, lhs, rhs.data(), req, trans_lhs, ret); + return; + } + CHECK_EQ(lhs.storage_type(), kCSRStorage); + CHECK_EQ(rhs.storage_type(), kRowSparseStorage); + CHECK_EQ(ret->storage_type(), kRowSparseStorage); + if (!lhs.storage_initialized() || !rhs.storage_initialized()) return; + CHECK_EQ(req, kWriteTo); + + using mshadow::Shape1; + using mxnet_op::Kernel; + using mxnet_op::set_zero; + using nnvm::dim_t; + mshadow::Stream* s = ctx.get_stream(); + + const TBlob data_l = lhs.data(); + const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); + const TBlob col_idx_l = lhs.aux_data(csr::kIdx); + const TBlob data_r = rhs.data(); + const TBlob row_idx_r = rhs.aux_data(rowsparse::kIdx); + + const dim_t num_rows_l = lhs.shape()[0]; + const dim_t num_cols_l = lhs.shape()[1]; + const dim_t num_cols_r = rhs.shape()[1]; + const dim_t nnr_r = rhs.storage_shape()[0]; + const dim_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; + dim_t num_threads; + // TODO: remove kernel dependency on warpSize=32 + if (threads_per_warp != 32) { + LOG(FATAL) << "DotCsrRspRspImpl GPU kernels expect warpSize=32"; + } + + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type + 
MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type + MSHADOW_IDX_TYPE_SWITCH(row_idx_r.type_flag_, RType, { // row idx type + if (trans_lhs) { + // Compute number of non-zero rows (nnr) of output matrix + // - alloc temp storage for row_flg array and for cub's prefix sum + // - mark non-zero columns of csr matrix in row_flg + // - compute inclusive prefix sum over marked array + // - copy last value (nnr_out) from device to host + dim_t* row_flg_out = NULL; + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + row_flg_out, + row_flg_out, + num_cols_l, + mshadow::Stream::GetStream(s)); + mshadow::Tensor workspace = ctx.requested[0] + .get_space_typed(Shape1(num_cols_l * sizeof(dim_t) + + temp_storage_bytes), s); + row_flg_out = reinterpret_cast(workspace.dptr_); + d_temp_storage = workspace.dptr_ + num_cols_l*sizeof(dim_t); + num_threads = num_cols_l; + Kernel::Launch(s, num_threads, row_flg_out); + num_threads = num_rows_l * threads_per_warp; + Kernel::Launch(s, num_threads, + row_flg_out, col_idx_l.dptr(), indptr_l.dptr(), + num_rows_l, num_cols_l); + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + row_flg_out, + row_flg_out, + num_cols_l, + mshadow::Stream::GetStream(s)); + dim_t nnr_out = 0; + CUDA_CALL(cudaMemcpy(&nnr_out, &row_flg_out[num_cols_l-1], sizeof(dim_t), + cudaMemcpyDeviceToHost)); + + // Allocate output matrix space + ret->CheckAndAlloc({mshadow::Shape1(nnr_out)}); + const TBlob data_out_blob = ret->data(); + const TBlob row_idx_out_blob = ret->aux_data(rowsparse::kIdx); + DType* data_out = data_out_blob.dptr(); + RType* row_idx_out = row_idx_out_blob.dptr(); + num_threads = nnr_out * num_cols_r; + Kernel::Launch(s, num_threads, data_out); + num_threads = nnr_out; + Kernel::Launch(s, num_threads, row_idx_out); + + // Fill row_idx array of output matrix, using the row_flg values + num_threads = num_cols_l; + Kernel::Launch(s, 
num_threads, + row_idx_out, row_flg_out, num_cols_l); + + // Perform matrix-matrix multiply + num_threads = nnr_out; + Kernel::Launch(s, num_threads, + data_out, row_idx_out, + data_l.dptr(), indptr_l.dptr(), col_idx_l.dptr(), + data_r.dptr(), row_idx_r.dptr(), + num_cols_r, nnr_r, nnr_out); + } else { + LOG(FATAL) << "DotCsrRspRspImpl has not implemented dot(csr, rsp1) = rsp2 yet."; + } + }); + }); + }); + }); +} + +/*! + * \brief GPU Impl of dot(csr, rsp) = dns and dot(csr.T, rsp) = dns + */ +inline void DotCsrRspDnsImpl(const OpContext& ctx, + const gpu& gpu_dev, + const NDArray& lhs, + const NDArray& rhs, + const OpReqType req, + const bool trans_lhs, + TBlob* ret) { + // Reuse dot(csr, dns) implementation if rhs rsp matrix is in fact dense + if (rhs.storage_shape()[0] == rhs.shape()[0]) { + DotCsrDnsDnsImpl(ctx, gpu_dev, lhs, rhs.data(), req, trans_lhs, ret); + return; + } + if (kNullOp == req) return; + CHECK_EQ(lhs.storage_type(), kCSRStorage); + CHECK_EQ(rhs.storage_type(), kRowSparseStorage); + + using mxnet_op::Kernel; + using mxnet_op::set_zero; + mshadow::Stream* s = ctx.get_stream(); + if (!lhs.storage_initialized() || !rhs.storage_initialized()) { + if (kWriteTo == req) { + MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, { // data type + Kernel::Launch(s, ret->Size(), ret->dptr()); + }); + } + return; + } + + using nnvm::dim_t; + const dim_t num_rows = ret->shape_[0]; + const dim_t num_cols = ret->shape_[1]; + const dim_t nnr_r = rhs.storage_shape()[0]; + dim_t num_threads; + + const TBlob data_l = lhs.data(); + const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); + const TBlob col_idx_l = lhs.aux_data(csr::kIdx); + const TBlob data_r = rhs.data(); + const TBlob row_idx_r = rhs.aux_data(rowsparse::kIdx); + + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type + 
MSHADOW_IDX_TYPE_SWITCH(row_idx_r.type_flag_, RType, { // row idx type + if (kWriteTo == req) { + num_threads = num_rows*num_cols; + Kernel::Launch(s, num_threads, ret->dptr()); + } + if (trans_lhs) { + LOG(FATAL) << "DotCsrRspDnsImpl has not implemented dot(csr.T, rsp) = dns yet."; + } else { + // TODO: Consider implementing a vector kernel for SpMV (similar to DotCsrDnsDns) + // Alloc temp storage for row_flg array + RType* row_flg_r = ctx.requested[0] + .get_space_typed(mshadow::Shape1(rhs.shape()[0]), s).dptr_; + num_threads = rhs.shape()[0]; + Kernel::Launch(s, num_threads, row_flg_r); + // Set row_flg index array + num_threads = nnr_r; + Kernel::Launch(s, num_threads, + row_flg_r, row_idx_r.dptr(), nnr_r); + // Perform sparse matrix-matrix multiply + num_threads = num_rows*num_cols; + Kernel::Launch(s, num_threads, + ret->dptr(), + data_l.dptr(), indptr_l.dptr(), col_idx_l.dptr(), + data_r.dptr(), row_idx_r.dptr(), row_flg_r, rhs.storage_shape()[0], + num_rows, num_cols); + } + }); + }); + }); + }); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_TENSOR_DOT_INL_CUH_ diff --git a/src/operator/tensor/dot-inl.h b/src/operator/tensor/dot-inl.h new file mode 100644 index 000000000000..aaf242e26fe1 --- /dev/null +++ b/src/operator/tensor/dot-inl.h @@ -0,0 +1,1007 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file dot-inl.h + * \brief Function definition of matrix dot operator + */ + +#ifndef MXNET_OPERATOR_TENSOR_DOT_INL_H_ +#define MXNET_OPERATOR_TENSOR_DOT_INL_H_ + +#include +#include +#include +#include +#include +#include "../mshadow_op.h" +#include "../elemwise_op_common.h" +#include "../mxnet_op.h" +#ifdef __CUDACC__ +#include "./dot-inl.cuh" +#endif // __CUDACC__ + +namespace mxnet { +namespace op { + +struct DotParam : public dmlc::Parameter { + bool transpose_a; + bool transpose_b; + DMLC_DECLARE_PARAMETER(DotParam) { + DMLC_DECLARE_FIELD(transpose_a) + .describe("If true then transpose the first input before dot.") + .set_default(false); + DMLC_DECLARE_FIELD(transpose_b) + .describe("If true then transpose the second input before dot.") + .set_default(false); + } +}; + +template +void DotForward_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + const DotParam& param = nnvm::get(attrs.parsed); + Stream *s = ctx.get_stream(); + CHECK_EQ(outputs[0].type_flag_, inputs[0].type_flag_) + << "Binary function only support input/output with the same type"; + CHECK_EQ(outputs[0].type_flag_, inputs[1].type_flag_) + << "Binary function only support input/output with the same type"; + CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) + << "dot only supports float32 and float64"; + MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + if 
(inputs[0].ndim() == 1 && inputs[1].ndim() == 1) { + CHECK_NE(req[0], kAddTo) << "AddTo not yet suported"; + Tensor out = outputs[0].get(s); + VectorDot(out, + inputs[0].get(s), + inputs[1].get(s)); + } else { + int ma, na, mb, nb, m, n; + if (param.transpose_a) { + ma = inputs[0].size(0); + na = inputs[0].Size()/ma; + m = na; + } else { + na = inputs[0].size(inputs[0].ndim()-1); + ma = inputs[0].Size()/na; + m = ma; + } + if (param.transpose_b) { + nb = inputs[1].size(inputs[1].ndim()-1); + mb = inputs[1].Size()/nb; + n = mb; + } else { + mb = inputs[1].size(0); + nb = inputs[1].Size()/mb; + n = nb; + } + Tensor input0 = + inputs[0].get_with_shape(Shape2(ma, na), s); + Tensor input1 = + inputs[1].get_with_shape(Shape2(mb, nb), s); + Tensor out = + outputs[0].get_with_shape(Shape2(m, n), s); + if (param.transpose_a && param.transpose_b) { + ASSIGN_DISPATCH(out, req[0], dot(input0.T(), input1.T())); + } else if (!param.transpose_a && param.transpose_b) { + ASSIGN_DISPATCH(out, req[0], dot(input0, input1.T())); + } else if (param.transpose_a && !param.transpose_b) { + ASSIGN_DISPATCH(out, req[0], dot(input0.T(), input1)); + } else { + ASSIGN_DISPATCH(out, req[0], dot(input0, input1)); + } + } + }); +} + +template +void DotBackward_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + const DotParam& param = nnvm::get(attrs.parsed); + Stream *s = ctx.get_stream(); + CHECK_NE(req[0], kWriteInplace); + CHECK_NE(req[1], kWriteInplace); + MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + if (inputs[1].ndim() == 1 && inputs[2].ndim() == 1) { + Tensor mout_grad = inputs[0].get(s); + Tensor mlhs_data = inputs[1].get(s); + Tensor mrhs_data = inputs[2].get(s); + Tensor mlhs_grad = outputs[0].get(s); + Tensor mrhs_grad = outputs[1].get(s); + ASSIGN_DISPATCH(mrhs_grad, req[1], + broadcast_scalar(mout_grad, 
mlhs_data.shape_) * mlhs_data); + ASSIGN_DISPATCH(mlhs_grad, req[0], + broadcast_scalar(mout_grad, mlhs_data.shape_) * mrhs_data); + } else { + int ma, na, mb, nb, m, n; + if (param.transpose_a) { + ma = outputs[0].size(0); + na = outputs[0].Size()/ma; + m = na; + } else { + na = outputs[0].size(outputs[0].ndim()-1); + ma = outputs[0].Size()/na; + m = ma; + } + if (param.transpose_b) { + nb = outputs[1].size(outputs[1].ndim()-1); + mb = outputs[1].Size()/nb; + n = mb; + } else { + mb = outputs[1].size(0); + nb = outputs[1].Size()/mb; + n = nb; + } + Tensor mout_grad = + inputs[0].get_with_shape(Shape2(m, n), s); + Tensor mlhs_data = + inputs[1].get_with_shape(Shape2(ma, na), s); + Tensor mrhs_data = + inputs[2].get_with_shape(Shape2(mb, nb), s); + Tensor mlhs_grad = + outputs[0].get_with_shape(Shape2(ma, na), s); + Tensor mrhs_grad = + outputs[1].get_with_shape(Shape2(mb, nb), s); + if (param.transpose_a && param.transpose_b) { + // Gradient of z = dot(x.T, y.T) + // dy = dot(x, dz).T = dot(dz.T, x.T) + // dx = dot(dz, y).T = dot(y.T, dz.T) + ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mout_grad.T(), mlhs_data.T())); + ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mrhs_data.T(), mout_grad.T())); + } else if (!param.transpose_a && param.transpose_b) { + // Gradient of z = dot(x, y.T) + // dy = dot(x.T, dz).T = dot(dz.T, x) + // dx = dot(dz, y) + ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mout_grad.T(), mlhs_data)); + ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mout_grad, mrhs_data)); + } else if (param.transpose_a && !param.transpose_b) { + // Gradient of z = dot(x.T, y) + // dy = dot(x, dz) + // dx = dot(dz, y.T).T = dot(y, dz.T) + ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mlhs_data, mout_grad)); + ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mrhs_data, mout_grad.T())); + } else { + // Gradient of z = dot(x, y) + // dy = dot(x.T, dz) + // dx = dot(dz, y.T) + ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mlhs_data.T(), mout_grad)); + ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mout_grad, mrhs_data.T())); + 
} + } + }); +} + +inline bool DotForwardInferStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + const DotParam& param = nnvm::get(attrs.parsed); + // csr has many zero columns, so the result of dot(csr.T, matrix) should be rsp + // TODO(stefan/haibin/jun): check type_assign return value + if (param.transpose_a && kCSRStorage == (*in_attrs)[0]) { + type_assign(&((*out_attrs)[0]), kRowSparseStorage); + } else { + type_assign(&((*out_attrs)[0]), kDefaultStorage); + } + return true; +} + +inline bool DotBackwardInferStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 3U); + CHECK_EQ(out_attrs->size(), 2U); + const DotParam& param = nnvm::get(attrs.parsed); + type_assign(&((*out_attrs)[0]), kDefaultStorage); + if (!param.transpose_a && kCSRStorage == (*in_attrs)[1]) { + type_assign(&((*out_attrs)[1]), kRowSparseStorage); + } else { + type_assign(&((*out_attrs)[1]), kDefaultStorage); + } + return true; +} + +/*! + * \brief CPU Kernel of dot(csr, dns1) = dns2 + * Parallelization by row blocks + */ +struct DotCsrDnsDnsByRowBlocks { + /*! 
+ * \brief + * \param i the i-th thread + */ + template + MSHADOW_CINLINE static void Map(int i, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t seg_len, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + const dim_t seg_start = i * seg_len; + if (seg_start >= num_rows) return; + const dim_t seg_end = std::min(seg_start + seg_len, num_rows); + for (dim_t j = seg_start; j < seg_end; ++j) { + if (indptr_l[j] == indptr_l[j+1]) continue; + const dim_t offset_out = j * num_cols; + for (IType k = indptr_l[j]; k < indptr_l[j+1]; ++k) { + const DType val = data_l[k]; + const dim_t offset_r = col_idx_l[k] * num_cols; + for (dim_t l = 0; l < num_cols; ++l) { + out[offset_out+l] += data_r[offset_r+l] * val; + } + } + } + } +}; + +/*! + * \brief CPU Kernel of dot(csr.T(), dns1) = dns2 + * Parallelization by row blocks + */ +struct DotCsrTransDnsDnsByRowBlocks { + /*! + * \brief + * \param i the i-th thread + */ + template + MSHADOW_CINLINE static void Map(int i, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t seg_len, + const nnvm::dim_t num_rows_l, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + const dim_t seg_start = i * seg_len; + if (seg_start >= num_rows) return; + const dim_t seg_end = (i + 1) * seg_len; + for (dim_t j = 0; j < num_rows_l; ++j) { + if (indptr_l[j] == indptr_l[j+1]) continue; + const dim_t offset_r = j * num_cols; + for (IType k = indptr_l[j]; k < indptr_l[j+1]; ++k) { + const CType col_idx = col_idx_l[k]; + if (col_idx < seg_start || col_idx >= seg_end) continue; + const dim_t offset_out = col_idx * num_cols; + const DType val = data_l[k]; + for (dim_t l = 0; l < num_cols; ++l) { + out[offset_out+l] += data_r[offset_r+l] * val; + } + } + } + } +}; + +/*! 
+ * \brief CPU Kernel of dot(csr.T(), dns) = rsp + * Parallelization by row blocks. + * This kernel fills up the row_idx array of the rsp + * with 1 for nonzero rows and 0 for zero rows. + * The matrix will be compacted after this kernel call. + */ +struct DotCsrTransDnsRspByRowBlocks { + /*! + * \brief + * \param i the i-th thread + */ + template + MSHADOW_CINLINE static void Map(int i, + DType* out, + RType* row_idx, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const nnvm::dim_t seg_len, + const nnvm::dim_t num_rows_l, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + using nnvm::dim_t; + const dim_t seg_start = i * seg_len; + if (seg_start >= num_rows) return; + const dim_t seg_end = (i + 1) * seg_len; + for (dim_t j = 0; j < num_rows_l; ++j) { + if (indptr_l[j] == indptr_l[j+1]) continue; + const dim_t offset_r = j * num_cols; + for (IType k = indptr_l[j]; k < indptr_l[j+1]; ++k) { + const CType col_idx = col_idx_l[k]; + if (col_idx < seg_start || col_idx >= seg_end) continue; + const dim_t offset_out = col_idx * num_cols; + row_idx[col_idx] = 1; + const DType val = data_l[k]; + for (dim_t l = 0; l < num_cols; ++l) { + out[offset_out+l] += data_r[offset_r+l] * val; + } + } + } + } +}; + +/*! + * \brief CPU Kernel of dot(csr, rsp) = dns + * Parallelization by row blocks + */ +struct DotCsrRspDnsByRowBlocks { + /*! 
+ * \brief + * \param i the i-th thread + * \param nnr_r storage_shape[0] of the rsp + * \param num_rows dns.shape[0] + * \param num_cols dns.shape[1] + */ + template + MSHADOW_CINLINE static void Map(int i, + DType* out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const RType* row_idx_r, + const nnvm::dim_t nnr_r, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols, + const nnvm::dim_t seg_len) { + using nnvm::dim_t; + const dim_t seg_start = i * seg_len; + if (seg_start >= num_rows) return; + const dim_t seg_end = std::min(seg_start + seg_len, num_rows); + for (dim_t j = seg_start; j < seg_end; ++j) { + if (indptr_l[j] == indptr_l[j+1]) continue; + const dim_t offset_out = j * num_cols; + // Use binary search to find the lower_bound of val in row_idx array + const RType* first = row_idx_r; + const RType* last = row_idx_r + nnr_r; + const CType val = col_idx_l[indptr_l[j]]; + const RType* it; + int count = last - first, step; + while (count > 0) { + it = first; + step = count / 2; + it += step; + if (*it < val) { + first = ++it; + count -= step + 1; + } else { + count = step; + } + } + const RType* row_idx_ptr = first; + // end of binary search + if (row_idx_ptr == row_idx_r+nnr_r || *row_idx_ptr > col_idx_l[indptr_l[j+1]-1]) continue; + for (IType k = indptr_l[j]; k < indptr_l[j+1] && row_idx_ptr != row_idx_r+nnr_r;) { + if (col_idx_l[k] == *row_idx_ptr) { + const dim_t offset_r = (row_idx_ptr - row_idx_r) * num_cols; + for (dim_t l = 0; l < num_cols; ++l) { + out[offset_out+l] += data_l[k] * data_r[offset_r+l]; + } + ++k; + ++row_idx_ptr; + } else if (col_idx_l[k] < *row_idx_ptr) { + ++k; + } else { + ++row_idx_ptr; + } + } + } + } +}; + +/*! + * \brief CPU Kernel of dot(csr.T(), rsp1) = rsp2, with row_idx marked for non-zero rows + * Parallelization by row blocks + */ +struct DotCsrTransRspRspByRowBlocks { + /*! 
+ * \brief + * \param i the i-th thread + * \param num_rows_l number of rows of lhs matrix + * \param nnr_r number of non-zero rows of rhs matrix + * \param num_rows number of rows of out matrix + * \param num_cols number of cols of out matrix + */ + template + MSHADOW_CINLINE static void Map(int i, + DType* out, + RType* row_idx_out, + const DType* data_l, + const IType* indptr_l, + const CType* col_idx_l, + const DType* data_r, + const RType* row_idx_r, + const nnvm::dim_t num_rows_l, + const nnvm::dim_t nnr_r, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols, + const nnvm::dim_t seg_len) { + using nnvm::dim_t; + const dim_t seg_start = i * seg_len; + if (seg_start >= num_rows) return; + const dim_t seg_end = (i + 1) * seg_len; + for (dim_t rid = 0; rid < nnr_r; ++rid) { + const RType j = row_idx_r[rid]; + if (indptr_l[j] == indptr_l[j+1]) continue; + const dim_t offset_r = rid * num_cols; + for (IType k = indptr_l[j]; k < indptr_l[j+1]; ++k) { + const CType col_idx = col_idx_l[k]; + if (col_idx < seg_start || col_idx >= seg_end) continue; + row_idx_out[col_idx] = 1; // mark nonzero row as 1 + const dim_t offset_out = col_idx * num_cols; + for (dim_t l = 0; l < num_cols; ++l) { + out[offset_out+l] += data_r[offset_r+l] * data_l[k]; + } + } + } + } +}; + +/*! 
+ * \brief CPU Impl of dot(csr, dns1) = dns2 and dot(csr.T, dns1) = dns2 + */ +inline void DotCsrDnsDnsImpl(const OpContext& ctx, + const cpu& cpu_dev, + const NDArray& lhs, + const TBlob& rhs, + const OpReqType req, + const bool trans_lhs, + TBlob* ret) { + if (kNullOp == req) return; + CHECK_EQ(lhs.storage_type(), kCSRStorage); + if (!lhs.storage_initialized()) return; + + using nnvm::dim_t; + + mshadow::Stream* s = ctx.get_stream(); + const TBlob data_l = lhs.data(); + const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); + const TBlob col_idx_l = lhs.aux_data(csr::kIdx); + const TBlob& data_r = rhs; + const TBlob data_out = *ret; + + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type + dim_t num_threads; + if (kWriteTo == req) { + num_threads = data_out.Size(); + mxnet_op::Kernel::Launch( + s, num_threads, data_out.dptr()); + } + num_threads = mxnet_op::get_num_threads(data_out.shape_[0]); + dim_t seg_len = (data_out.shape_[0] + num_threads - 1) / num_threads; + if (trans_lhs) { + mxnet_op::Kernel::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), seg_len, + lhs.shape()[0], data_out.shape_[0], data_out.shape_[1]); + } else { + mxnet_op::Kernel::Launch(s, num_threads, + data_out.dptr(), data_l.dptr(), indptr_l.dptr(), + col_idx_l.dptr(), data_r.dptr(), seg_len, + data_out.shape_[0], data_out.shape_[1]); + } + }); + }); + }); +} + +/*! 
+ * \brief CPU Impl of dot(csr.T, dns) = rsp + */ +inline void DotCsrDnsRspImpl(const OpContext& ctx, + const cpu& cpu_dev, + const NDArray& lhs, + const TBlob& rhs, + const OpReqType req, + const bool trans_lhs, + NDArray* ret) { + if (kNullOp == req) return; + CHECK_EQ(lhs.storage_type(), kCSRStorage); + CHECK_EQ(ret->storage_type(), kRowSparseStorage); + if (!lhs.storage_initialized()) return; + CHECK_EQ(req, kWriteTo); + + using mxnet_op::set_zero; + using nnvm::dim_t; + + mshadow::Stream* s = ctx.get_stream(); + const TBlob data_l = lhs.data(); + const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); + const TBlob col_idx_l = lhs.aux_data(csr::kIdx); + const TBlob& data_r = rhs; + + // pre-allocate spaces for ret using the dense dimension size + ret->CheckAndAlloc({mshadow::Shape1(lhs.shape()[1])}); + const TBlob data_out = ret->data(); + const TBlob row_idx_out = ret->aux_data(rowsparse::kIdx); + + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type + MSHADOW_IDX_TYPE_SWITCH(row_idx_out.type_flag_, RType, { // row idx type + dim_t num_threads = data_out.Size(); + mxnet_op::Kernel::Launch(s, num_threads, data_out.dptr()); + RType* row_idx = row_idx_out.dptr(); + num_threads = row_idx_out.Size(); + mxnet_op::Kernel::Launch(s, num_threads, row_idx); + num_threads = mxnet_op::get_num_threads(data_out.shape_[0]); + dim_t seg_len = (data_out.shape_[0] + num_threads - 1) / num_threads; + if (trans_lhs) { + mxnet_op::Kernel::Launch(s, num_threads, + data_out.dptr(), row_idx, data_l.dptr(), + indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), + seg_len, lhs.shape()[0], data_out.shape_[0], data_out.shape_[1]); + dim_t nnr = 0; + nnr = mxnet::common::ParallelAccumulate(row_idx, ret->shape()[0], nnr); + ret->set_aux_shape(rowsparse::kIdx, mshadow::Shape1(nnr)); + if (0 == nnr) return; + mshadow::Tensor rsp_data = 
data_out.FlatTo2D(s); + dim_t idx = 0; + for (index_t i = 0; i < ret->shape()[0]; ++i) { + if (row_idx[i] > 0) { + row_idx[idx] = i; + mshadow::Copy(rsp_data[idx], rsp_data[i], s); + ++idx; + } + } + } else { + LOG(FATAL) << "DotCsrDnsRspImpl has not implemented dot(csr, dns)=rsp yet."; + } + }); + }); + }); + }); +} + +/*! + * \brief CPU Impl of dot(csr, rsp) = dns + */ +inline void DotCsrRspDnsImpl(const OpContext& ctx, + const cpu& cpu_dev, + const NDArray& lhs, + const NDArray& rhs, + const OpReqType req, + const bool trans_lhs, + TBlob* ret) { + if (kNullOp == req) return; + // reuse csr dns implementation when storage_shape == shape for rhs + if (rhs.storage_shape()[0] == rhs.shape()[0]) { // if rsp is actually dense + DotCsrDnsDnsImpl(ctx, cpu_dev, lhs, rhs.data(), req, trans_lhs, ret); + return; + } + + CHECK_EQ(lhs.storage_type(), kCSRStorage); + CHECK_EQ(rhs.storage_type(), kRowSparseStorage); + mshadow::Stream* s = ctx.get_stream(); + if (!lhs.storage_initialized() || !rhs.storage_initialized()) { + if (kWriteTo == req) { + MSHADOW_SGL_DBL_TYPE_SWITCH(ret->type_flag_, DType, { // data type + mxnet_op::Kernel::Launch( + s, ret->Size(), ret->dptr()); + }); + } + return; + } + using nnvm::dim_t; + + const TBlob data_l = lhs.data(); + const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); + const TBlob col_idx_l = lhs.aux_data(csr::kIdx); + const TBlob data_r = rhs.data(); + const TBlob row_idx_r = rhs.aux_data(rowsparse::kIdx); + + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type + MSHADOW_IDX_TYPE_SWITCH(row_idx_r.type_flag_, RType, { // row idx type + dim_t num_threads; + if (kWriteTo == req) { + num_threads = ret->Size(); + mxnet_op::Kernel::Launch(s, num_threads, + ret->dptr()); + } + num_threads = mxnet_op::get_num_threads(ret->shape_[0]); + dim_t seg_len = (ret->shape_[0] + num_threads - 1) / 
num_threads; + if (trans_lhs) { + LOG(FATAL) << "DotCsrRspDnsImpl has not implemented dot(csr.T, rsp) = dns yet"; + } else { + mxnet_op::Kernel::Launch(s, num_threads, + ret->dptr(), data_l.dptr(), + indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), + row_idx_r.dptr(), rhs.storage_shape()[0], + ret->shape_[0], ret->shape_[1], seg_len); + } + }); + }); + }); + }); +} + +/*! + * \brief CPU Impl of dot(csr.T, rsp1) = rsp2 + */ +inline void DotCsrRspRspImpl(const OpContext& ctx, + const cpu& cpu_dev, + const NDArray& lhs, + const NDArray& rhs, + const OpReqType req, + const bool trans_lhs, + NDArray* ret) { + if (kNullOp == req) return; + // reuse csr dns implementation when storage_shape == shape for rhs + if (rhs.storage_shape()[0] == rhs.shape()[0]) { // if rsp is actually dense + DotCsrDnsRspImpl(ctx, cpu_dev, lhs, rhs.data(), req, trans_lhs, ret); + return; + } + + CHECK_EQ(lhs.storage_type(), kCSRStorage); + CHECK_EQ(rhs.storage_type(), kRowSparseStorage); + CHECK_EQ(ret->storage_type(), kRowSparseStorage); + if (!lhs.storage_initialized() || !rhs.storage_initialized()) return; + CHECK_EQ(req, kWriteTo); + + using mxnet_op::set_zero; + using nnvm::dim_t; + + mshadow::Stream* s = ctx.get_stream(); + const TBlob data_l = lhs.data(); + const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); + const TBlob col_idx_l = lhs.aux_data(csr::kIdx); + const TBlob data_r = rhs.data(); + const TBlob row_idx_r = rhs.aux_data(rowsparse::kIdx); + + // pre-allocate spaces for ret using the dense dimension size + if (ret->storage_type() == kRowSparseStorage) { + ret->CheckAndAlloc({mshadow::Shape1(lhs.shape()[1])}); + } + const TBlob data_out = ret->data(); + const TBlob row_idx_out = ret->aux_data(rowsparse::kIdx); + + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type + MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type + MSHADOW_IDX_TYPE_SWITCH(row_idx_r.type_flag_, RType, { 
// row idx type + dim_t num_threads = data_out.Size(); + mxnet_op::Kernel::Launch(s, num_threads, data_out.dptr()); + num_threads = mxnet_op::get_num_threads(data_out.shape_[0]); + dim_t seg_len = (data_out.shape_[0] + num_threads - 1) / num_threads; + if (trans_lhs) { + RType* row_idx = row_idx_out.dptr(); + num_threads = row_idx_out.Size(); + mxnet_op::Kernel::Launch(s, num_threads, row_idx); + mxnet_op::Kernel::Launch(s, num_threads, + data_out.dptr(), row_idx, data_l.dptr(), + indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), + row_idx_r.dptr(), lhs.shape()[0], rhs.storage_shape()[0], + ret->shape()[0], ret->shape()[1], seg_len); + dim_t nnr = 0; + nnr = mxnet::common::ParallelAccumulate(row_idx, ret->shape()[0], nnr); + ret->set_aux_shape(rowsparse::kIdx, mshadow::Shape1(nnr)); + if (0 == nnr) return; + mshadow::Tensor rsp_data = data_out.FlatTo2D(s); + dim_t idx = 0; + for (index_t i = 0; i < ret->shape()[0]; ++i) { + if (row_idx[i] > 0) { + row_idx[idx] = i; + mshadow::Copy(rsp_data[idx], rsp_data[i], s); + ++idx; + } + } + } else { + LOG(FATAL) << "DotCsrRspRspImpl has not implemented dot(csr, rsp) = rsp2 yet"; + } + }); + }); + }); + }); +} + +inline bool DotShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const DotParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + TShape& lshape = (*in_attrs)[0]; + TShape& rshape = (*in_attrs)[1]; + if (lshape.ndim() == 1 && rshape.ndim() == 1) { + CHECK(!param.transpose_a && !param.transpose_b) << "Cannot transpose vectors"; + CHECK_EQ(lshape[0], rshape[0]) << "dot shape error: " << lshape << " X " << rshape; + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape1(1)); + } else { + bool Ta = param.transpose_a, Tb = param.transpose_b; + TShape L[2], R[2]; + if (Ta) { + L[0] = mshadow::Shape1(lshape[0]); + L[1] = lshape.ndim() > 1 ? TShape(&lshape[1], &lshape[lshape.ndim()]) : TShape(1); + } else { + L[0] = lshape.ndim() > 1 ? 
TShape(&lshape[0], &lshape[lshape.ndim()-1]) : TShape(1); + L[1] = mshadow::Shape1(lshape[lshape.ndim()-1]); + } + if (Tb) { + R[0] = rshape.ndim() > 1 ? TShape(&rshape[0], &rshape[rshape.ndim()-1]) : TShape(1); + R[1] = mshadow::Shape1(rshape[rshape.ndim()-1]); + } else { + R[0] = mshadow::Shape1(rshape[0]); + R[1] = rshape.ndim() > 1 ? TShape(&rshape[1], &rshape[rshape.ndim()]) : TShape(1); + } + + if (L[!Ta].Size() != 0 && R[Tb].Size() != 0) { + CHECK_EQ(L[!Ta].Size(), R[Tb].Size()) + << "dot shape error: " << lshape << " X " << rshape; + } + std::vector buf; + if (lshape.ndim() > 1) buf.insert(buf.end(), &L[Ta][0], &L[Ta][L[Ta].ndim()]); + if (rshape.ndim() > 1) buf.insert(buf.end(), &R[!Tb][0], &R[!Tb][R[!Tb].ndim()]); + TShape oshape(buf.begin(), buf.end()); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); + } + return true; +} + +template +void DotForwardEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const DotParam& param = nnvm::get(attrs.parsed); + CHECK(!param.transpose_b) << "transposing rhs of the sparse dot op is not supported"; + CHECK_EQ(inputs[0].shape().ndim(), 2) << "sparse dot only supports 2 dimensional lhs"; + CHECK_EQ(inputs[1].shape().ndim(), 2) << "sparse dot only supports 2 dimensional rhs"; + auto lhs_stype = inputs[0].storage_type(); + auto rhs_stype = inputs[1].storage_type(); + auto out_stype = outputs[0].storage_type(); + if (lhs_stype == kCSRStorage && rhs_stype == kDefaultStorage && out_stype == kDefaultStorage) { + TBlob ret = outputs[0].data(); + DotCsrDnsDnsImpl(ctx, xpu(), inputs[0], inputs[1].data(), req[0], param.transpose_a, &ret); + } else if (lhs_stype == kCSRStorage && rhs_stype == kRowSparseStorage + && out_stype == kDefaultStorage) { + TBlob ret = outputs[0].data(); + DotCsrRspDnsImpl(ctx, xpu(), inputs[0], inputs[1], req[0], 
param.transpose_a, &ret); + } else if (lhs_stype == kCSRStorage && rhs_stype == kDefaultStorage + && out_stype == kRowSparseStorage) { + NDArray out = outputs[0]; + DotCsrDnsRspImpl(ctx, xpu(), inputs[0], inputs[1].data(), req[0], param.transpose_a, &out); + } else if (lhs_stype == kCSRStorage && rhs_stype == kRowSparseStorage + && out_stype == kRowSparseStorage) { + NDArray ret = outputs[0]; + DotCsrRspRspImpl(ctx, xpu(), inputs[0], inputs[1], req[0], param.transpose_a, &ret); + } else { + FCompExFallback(attrs, ctx, inputs, req, outputs, DotForward_, "DotForward_"); + } +} + +template +void DotBackwardEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 2U); + CHECK_EQ(req.size(), 2U); + CHECK_EQ(kNullOp, req[0]) + << "sparse dot does not support computing the gradient of the csr/lhs"; + CHECK_NE(req[1], kWriteInplace) << "DotBackwardEx does not support WriteInplace"; + + const DotParam& param = nnvm::get(attrs.parsed); + CHECK(!param.transpose_b) << "sparse dot only supports dot(A, X) and dot(A.T(), X)"; + CHECK_EQ(inputs[0].shape().ndim(), 2) << "sparse dot only supports 2 dimensional lhs"; + CHECK_EQ(inputs[1].shape().ndim(), 2) << "sparse dot only supports 2 dimensional rhs"; + const auto ograd_stype = inputs[0].storage_type(); + const auto lhs_stype = inputs[1].storage_type(); + const auto rhs_stype = inputs[2].storage_type(); + const auto grad_rhs_stype = outputs[1].storage_type(); + if (ograd_stype == kDefaultStorage // ograd dns format + && lhs_stype == kCSRStorage // csr input lhs of the op + && grad_rhs_stype == kDefaultStorage) { // grad(rhs) dns format + TBlob ret = outputs[1].data(); + DotCsrDnsDnsImpl(ctx, xpu(), inputs[1], inputs[0].data(), req[1], !param.transpose_a, &ret); + } else if (ograd_stype == kDefaultStorage + && lhs_stype == kCSRStorage + && grad_rhs_stype == kRowSparseStorage) { + NDArray 
ret = outputs[1]; + DotCsrDnsRspImpl(ctx, xpu(), inputs[1], inputs[0].data(), req[1], !param.transpose_a, &ret); + } else { + FCompExFallback(attrs, ctx, inputs, req, outputs, DotBackward_, "DotBackward_"); + } +} + +template +void BatchDotForward_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + const DotParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(outputs[0].type_flag_, inputs[0].type_flag_) + << "Binary function only support input/output with the same type"; + CHECK_EQ(outputs[0].type_flag_, inputs[1].type_flag_) + << "Binary function only support input/output with the same type"; + CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) + << "dot only supports float32 and float64"; + MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + mshadow::Tensor out = outputs[0].get(s); + mshadow::Tensor mlhs = inputs[0].get(s); + mshadow::Tensor mrhs = inputs[1].get(s); + mshadow::Tensor workspace = + ctx.requested[0].get_space_typed(mshadow::Shape1(3 * out.size(0)), s); + if (kNullOp != req[0]) { + if (param.transpose_a && param.transpose_b) { + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + workspace); + } else if (!param.transpose_a && param.transpose_b) { + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + workspace); + } else if (param.transpose_a && !param.transpose_b) { + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + workspace); + } else { + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? 
(DType)1.0f : (DType)0.0f, + workspace); + } + } + }); +} + +template +void BatchDotBackward_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + const DotParam& param = nnvm::get(attrs.parsed); + CHECK_NE(req[1], kWriteInplace); + CHECK_NE(req[0], kWriteInplace); + CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) + << "dot only supports float32 and float64"; + MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + mshadow::Tensor mout_grad = inputs[0].get(s); + mshadow::Tensor mlhs_data = inputs[1].get(s); + mshadow::Tensor mrhs_data = inputs[2].get(s); + mshadow::Tensor mlhs_grad = outputs[0].get(s); + mshadow::Tensor mrhs_grad = outputs[1].get(s); + mshadow::Tensor workspace = + ctx.requested[0].get_space_typed( + mshadow::Shape2(2, 3 * mout_grad.size(0)), s); + mshadow::Tensor rhs_workspace = workspace[0]; + mshadow::Tensor lhs_workspace = workspace[1]; + if (param.transpose_a && param.transpose_b) { + // Gradient of z = dot(x.T, y.T) + // dy = dot(x, dz).T = dot(dz.T, x.T) + // dx = dot(dz, y).T = dot(y.T, dz.T) + if (kNullOp != req[1]) { + mshadow::BatchGEMM(mrhs_grad, mout_grad, mlhs_data, (DType)1.0f, + (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, + rhs_workspace); + } + if (kNullOp != req[0]) { + mshadow::BatchGEMM(mlhs_grad, mrhs_data, mout_grad, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + lhs_workspace); + } + } else if (!param.transpose_a && param.transpose_b) { + // Gradient of z = dot(x, y.T) + // dy = dot(x.T, dz).T = dot(dz.T, x) + // dx = dot(dz, y) + if (kNullOp != req[1]) { + mshadow::BatchGEMM(mrhs_grad, mout_grad, mlhs_data, (DType)1.0f, + (kAddTo == req[1]) ? 
(DType)1.0f : (DType)0.0f, + rhs_workspace); + } + if (kNullOp != req[0]) { + mshadow::BatchGEMM(mlhs_grad, mout_grad, mrhs_data, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + lhs_workspace); + } + } else if (param.transpose_a && !param.transpose_b) { + // Gradient of z = dot(x.T, y) + // dy = dot(x, dz) + // dx = dot(dz, y.T).T = dot(y, dz.T) + if (kNullOp != req[1]) { + mshadow::BatchGEMM(mrhs_grad, mlhs_data, mout_grad, (DType)1.0f, + (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, + rhs_workspace); + } + if (kNullOp != req[0]) { + mshadow::BatchGEMM(mlhs_grad, mrhs_data, mout_grad, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + lhs_workspace); + } + } else { + // Gradient of z = dot(x, y) + // dy = dot(x.T, dz) + // dx = dot(dz, y.T) + if (kNullOp != req[1]) { + mshadow::BatchGEMM(mrhs_grad, mlhs_data, mout_grad, (DType)1.0f, + (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, + rhs_workspace); + } + if (kNullOp != req[0]) { + mshadow::BatchGEMM(mlhs_grad, mout_grad, mrhs_data, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + lhs_workspace); + } + } + }); +} + +inline bool BatchDotShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + const DotParam& param = nnvm::get(attrs.parsed); + TShape& lshape = (*in_attrs)[0]; + TShape& rshape = (*in_attrs)[1]; + if (lshape.ndim() == 3 && rshape.ndim() == 3) { + CHECK(lshape[0] == rshape[0]) + << "batch_dot shape error(batch_size must be equal): " << lshape << " X " << rshape + << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; + index_t out_m = param.transpose_a ? lshape[2] : lshape[1]; + index_t lshape_k = param.transpose_a ? lshape[1] : lshape[2]; + index_t out_n = param.transpose_b ? rshape[1] : rshape[2]; + index_t rshape_k = param.transpose_b ? 
rshape[2] : rshape[1]; + CHECK(lshape_k == rshape_k) + << "batch_dot shape error(shape mismatch): " << lshape << " X " << rshape + << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape3(lshape[0], out_m, out_n)); + } else { + LOG(FATAL) << "batch_dot currently only support 3D*3D array" + << lshape << " v.s. " << rshape; + } + return true; +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_TENSOR_DOT_INL_H_ diff --git a/src/operator/tensor/dot.cc b/src/operator/tensor/dot.cc new file mode 100644 index 000000000000..a7fa2c7933a5 --- /dev/null +++ b/src/operator/tensor/dot.cc @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file dot.cc + * \brief CPU Implementation of matrix dot + */ + +#include "./dot-inl.h" + +namespace mxnet { +namespace op { +DMLC_REGISTER_PARAMETER(DotParam); + +NNVM_REGISTER_OP(dot) +.add_alias("_sparse_dot") // alias for op registration under mxnet.ndarray.sparse +.describe(R"doc(Dot product of two arrays. 
+ +``dot``'s behavior depends on the input array dimensions: + +- 1-D arrays: inner product of vectors +- 2-D arrays: matrix multiplication +- N-D arrays: a sum product over the last axis of the first input and the first + axis of the second input + + For example, given 3-D ``x`` with shape `(n,m,k)` and ``y`` with shape `(k,r,s)`, the + result array will have shape `(n,m,r,s)`. It is computed by:: + + dot(x,y)[i,j,a,b] = sum(x[i,j,:]*y[:,a,b]) + + Example:: + + x = reshape([0,1,2,3,4,5,6,7], shape=(2,2,2)) + y = reshape([7,6,5,4,3,2,1,0], shape=(2,2,2)) + dot(x,y)[0,0,1,1] = 0 + sum(x[0,0,:]*y[:,1,1]) = 0 + +The storage type of ``dot`` output depends on storage types of inputs and transpose options: + +- dot(csr, default) = default +- dot(csr.T, default) = row_sparse +- dot(csr, row_sparse) = default +- otherwise, ``dot`` generates output with default storage + +)doc" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"lhs", "rhs"}; + }) +.set_attr("FInferShape", DotShape) +.set_attr("FInferType", ElemwiseType<2, 1>) +.set_attr("FInferStorageType", DotForwardInferStorageType) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", DotForward_) +.set_attr("FComputeEx", DotForwardEx) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_dot"}) +.add_argument("lhs", "NDArray-or-Symbol", "The first input") +.add_argument("rhs", "NDArray-or-Symbol", "The second input") +.add_arguments(DotParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_dot) +.set_num_inputs(3) +.set_num_outputs(2) +.set_attr_parser(ParamParser) +.set_attr("TIsBackward", true) +.set_attr("FInferStorageType", DotBackwardInferStorageType) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", DotBackward_) 
+.set_attr("FComputeEx", DotBackwardEx) +.add_arguments(DotParam::__FIELDS__()); + +NNVM_REGISTER_OP(batch_dot) +.describe(R"doc(Batchwise dot product. + +``batch_dot`` is used to compute dot product of ``x`` and ``y`` when ``x`` and +``y`` are data in batch, namely 3D arrays in shape of `(batch_size, :, :)`. + +For example, given ``x`` with shape `(batch_size, n, m)` and ``y`` with shape +`(batch_size, m, k)`, the result array will have shape `(batch_size, n, k)`, +which is computed by:: + + batch_dot(x,y)[i,:,:] = dot(x[i,:,:], y[i,:,:]) + +)doc" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"lhs", "rhs"}; + }) +.set_attr("FInferShape", BatchDotShape) +.set_attr("FInferType", ElemwiseType<2, 1>) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", BatchDotForward_) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_batch_dot"}) +.add_argument("lhs", "NDArray-or-Symbol", "The first input") +.add_argument("rhs", "NDArray-or-Symbol", "The second input") +.add_arguments(DotParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_batch_dot) +.set_num_inputs(3) +.set_num_outputs(2) +.set_attr_parser(ParamParser) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("TIsBackward", true) +.set_attr("FCompute", BatchDotBackward_); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/dot.cu b/src/operator/tensor/dot.cu new file mode 100644 index 000000000000..8ee2e2832fbb --- /dev/null +++ b/src/operator/tensor/dot.cu @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file dot.cu + * \brief GPU Implementation of matrix dot + */ + +#include "./dot-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(dot) +.set_attr("FCompute", DotForward_) +.set_attr("FComputeEx", DotForwardEx); + +NNVM_REGISTER_OP(_backward_dot) +.set_attr("FCompute", DotBackward_) +.set_attr("FComputeEx", DotBackwardEx); + +NNVM_REGISTER_OP(batch_dot) +.set_attr("FCompute", BatchDotForward_); + +NNVM_REGISTER_OP(_backward_batch_dot) +.set_attr("FCompute", BatchDotBackward_); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_broadcast_op_basic.cc b/src/operator/tensor/elemwise_binary_broadcast_op_basic.cc index c80d46a883ea..8c97849e20dc 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_broadcast_op_basic.cc @@ -123,6 +123,7 @@ Example:: .set_attr("FCompute", BinaryBroadcastCompute) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mul"}); + NNVM_REGISTER_OP(_backward_broadcast_mul) .set_num_inputs(3) .set_num_outputs(2) diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index 87b0d46a63c9..ddcad5e61ba0 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -28,10 +28,12 @@ #include #include #include +#include #include "../mxnet_op.h" #include 
"../mshadow_op.h" #include "../elemwise_op_common.h" -#include "../mxnet_op.h" +#include "./init_op.h" +#include "../../common/utils.h" namespace mxnet { namespace op { @@ -141,6 +143,120 @@ void BinaryBackwardUseNone_(const nnvm::NodeAttrs& attrs, } } +// TODO(haibin) This is a single-thread inefficient implementation +// This implementation only works on CPU +template +void BinaryComputeRspRspImpl(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (req[0] == kNullOp) return; + CHECK(req[0] == kWriteTo) << "only kWriteTo is supported for rowsparse elemwise_add"; + using namespace rowsparse; + using namespace mshadow; + auto &lhs = inputs[0]; + auto &rhs = inputs[1]; + auto &output = outputs[0]; + + bool init_l = lhs.storage_initialized(); + bool init_r = rhs.storage_initialized(); + Stream *s = ctx.get_stream(); + // both inputs are zeros + if (!init_l && !init_r) { + NDArray out = output; + FillZerosRspImpl(s, &out); + return; + } + // Memory Estimation: This is (roughly) the number of result rows. 
We still + // need to subtract the number of common rows + unsigned int num_rows_l = lhs.aux_shape(kIdx)[0]; + unsigned int num_rows_r = rhs.aux_shape(kIdx)[0]; + unsigned int num_rows_total = num_rows_l + num_rows_r; + auto row_len = output.shape().ProdShape(1, output.shape().ndim()); + output.CheckAndAlloc({Shape1(num_rows_total)}); + CHECK_GT(row_len, 0); + MSHADOW_TYPE_SWITCH(output.dtype(), DType, { + MSHADOW_TYPE_SWITCH(lhs.aux_type(kIdx), IType, { + // Indices + auto indices_l = lhs.aux_data(kIdx).dptr(); + auto indices_r = rhs.aux_data(kIdx).dptr(); + auto indices_out = output.aux_data(kIdx).dptr(); + // Data + auto data_l = lhs.data().get_with_shape(Shape2(num_rows_l, row_len), s); + auto data_r = rhs.data().get_with_shape(Shape2(num_rows_r, row_len), s); + auto out = output.data().get_with_shape(Shape2(num_rows_total, row_len), s); + + // TODO(haibin) A more appropriate way: Copy to output, then apply ops + size_t iter_l = 0; + size_t iter_r = 0; + size_t iter_out = 0; + int32_t num_common_rows = 0; + while (iter_l < num_rows_l && iter_r < num_rows_r) { + auto idx_l = indices_l[iter_l]; + auto idx_r = indices_r[iter_r]; + if (idx_l == idx_r) { + // Same row + indices_out[iter_out] = idx_l; + Copy(out[iter_out], data_l[iter_l++], s); + out[iter_out] += data_r[iter_r++]; + num_common_rows++; + } else if (idx_l < idx_r) { + // Left only + indices_out[iter_out] = idx_l; + Copy(out[iter_out], data_l[iter_l++], s); + } else { + // Right only + indices_out[iter_out] = idx_r; + Copy(out[iter_out], data_r[iter_r++], s); + } + iter_out++; + } + // Copying over the rest of the rows + while (iter_l < num_rows_l) { + indices_out[iter_out] = indices_l[iter_l]; + Copy(out[iter_out++], data_l[iter_l++], s); + } + while (iter_r < num_rows_r) { + indices_out[iter_out] = indices_r[iter_r]; + Copy(out[iter_out++], data_r[iter_r++], s); + } + auto new_sshape = TShape(output.aux_shape(rowsparse::kIdx)); + CHECK_GT(new_sshape[0], num_common_rows); + new_sshape[0] -= 
num_common_rows; + output.set_aux_shape(rowsparse::kIdx, new_sshape); + }); + }); +} + +template +void BinaryComputeEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 2); + CHECK_EQ(outputs.size(), 1); + if (typeid(OP) == typeid(mshadow::op::plus)) { + // If any input is dense, fallback to FCompute + // TODO(haibin) implement dns + rsp in a separate kernel + if (common::ContainsDefaultStorage(inputs)) { + FCompExFallback(attrs, ctx, inputs, req, outputs, + BinaryCompute, "BinaryCompute"); + return; + } + CHECK_EQ(inputs[0].storage_type(), kRowSparseStorage) << "Sparse type not supported yet"; + CHECK_EQ(inputs[1].storage_type(), kRowSparseStorage) << "Sparse type not supported yet"; + BinaryComputeRspRspImpl(attrs, ctx, inputs, req, outputs); + return; + } else { + LOG(FATAL) << "Not implemented"; + } +} + template void BinaryBackwardUseNone(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -152,6 +268,55 @@ void BinaryBackwardUseNone(const nnvm::NodeAttrs& attrs, }); } +// Only implemented for _backward_add for now +template +void BinaryBackwardUseNoneRsp(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + CHECK_EQ(inputs[0].storage_type(), kRowSparseStorage); + CHECK_EQ(outputs[0].storage_type(), kRowSparseStorage); + CHECK_EQ(outputs[1].storage_type(), kRowSparseStorage); + CHECK(typeid(LOP) == typeid(mshadow_op::identity)); + CHECK(typeid(ROP) == typeid(mshadow_op::identity)); + TShape shape = inputs[0].aux_shape(rowsparse::kIdx); + outputs[0].CheckAndAlloc({shape}); + outputs[1].CheckAndAlloc({shape}); + MSHADOW_TYPE_SWITCH(outputs[0].dtype(), DType, { + 
MSHADOW_TYPE_SWITCH(outputs[0].aux_type(rowsparse::kIdx), IType, { + auto lgrad_idx = outputs[0].aux_data(rowsparse::kIdx).FlatTo1D(s); + auto rgrad_idx = outputs[1].aux_data(rowsparse::kIdx).FlatTo1D(s); + auto ograd_idx = inputs[0].aux_data(rowsparse::kIdx).FlatTo1D(s); + auto lgrad = outputs[0].data().FlatTo1D(s); + Tensor rgrad = outputs[1].data().FlatTo1D(s); + Tensor ograd = inputs[0].data().FlatTo1D(s); + ASSIGN_DISPATCH(lgrad, req[0], F(ograd)); + ASSIGN_DISPATCH(rgrad, req[1], F(ograd)); + ASSIGN_DISPATCH(lgrad_idx, req[0], F(ograd_idx)); + ASSIGN_DISPATCH(rgrad_idx, req[1], F(ograd_idx)); + }); + }); +} +// Only implemented for _backward_add for now +template +void BinaryBackwardUseNoneEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + auto stype = inputs[0].storage_type(); + CHECK_EQ(stype, kRowSparseStorage) << "Not implemented yet"; + BinaryBackwardUseNoneRsp(attrs, ctx, inputs, req, outputs); + // TODO(haibin) fallback for kDefaultStorage +} + template void BinaryBackwardUseNoneWithHalf2(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -232,7 +397,7 @@ void BinaryBackwardUseInWithHalf2(const nnvm::NodeAttrs& attrs, [](const NodeAttrs& attrs){ \ return std::vector >{{0, 0}, {1, 0}}; \ }) \ - .add_argument("lhs", "NDArray-or-Symbol", "first input") \ + .add_argument("lhs", "NDArray-or-Symbol", "first input") \ .add_argument("rhs", "NDArray-or-Symbol", "second input") } // namespace op diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index 65d4ca9aadd6..a40d86fdfcd6 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -27,10 +27,19 @@ namespace mxnet { namespace op { MXNET_OPERATOR_REGISTER_BINARY(elemwise_add) 
-.add_alias("_add").add_alias("_plus").add_alias("_Plus") -.describe("Adds arguments element-wise.") +.add_alias("_add").add_alias("_plus").add_alias("_Plus").add_alias("_sparse_elemwise_add") +.describe(R"code(Adds arguments element-wise. + +The storage type of ``elemwise_add`` output depends on storage types of inputs + +- elemwise_add(row_sparse, row_sparse) = row_sparse +- otherwise, ``elemwise_add`` generates output with default storage + +)code") .set_attr("FCompute", BinaryCompute) -.set_attr("FGradient", CloneGradient{"_backward_add"}); +.set_attr("FGradient", CloneGradient{"_backward_add"}) +.set_attr("FComputeEx", BinaryComputeEx) +.set_attr("FInferStorageType", ElemwiseStorageType<2, 1>); // specialized gradient add function to do add to optimization // this must differ from elemwise_add to prevent add to optimization in forward pass. @@ -46,7 +55,10 @@ NNVM_REGISTER_OP(_backward_add) return std::vector >{{0, 0}, {0, 1}}; }) .set_attr("FCompute", BinaryBackwardUseNone); + mshadow_op::identity>) +.set_attr("FComputeEx", + BinaryBackwardUseNoneEx) +.set_attr("FInferStorageType", ElemwiseStorageType<1, 2>); MXNET_OPERATOR_REGISTER_BINARY(_sub) .add_alias("_minus").add_alias("_Minus") diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc index 652be72f3fab..f6b6859505f8 100644 --- a/src/operator/tensor/elemwise_sum.cc +++ b/src/operator/tensor/elemwise_sum.cc @@ -22,6 +22,7 @@ * \brief elementwise sum operator */ #include "./elemwise_sum.h" +#include "../../ndarray/ndarray_function.h" namespace mxnet { namespace op { @@ -54,14 +55,69 @@ std::vector ElementWiseSumGrad( return ret; } +bool ElementWiseSumShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(out_attrs->size(), 1); + return ElemwiseAttr( + attrs, in_attrs, out_attrs, TShape()); +} + +bool ElementWiseSumType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(out_attrs->size(), 
1); + return ElemwiseAttr( + attrs, in_attrs, out_attrs, -1); +} + +bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK(!in_attrs->empty()); + CHECK_EQ(out_attrs->size(), 1U); + return ElemwiseStorageAttr( + attrs, in_attrs, out_attrs); +} + +void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK(!inputs.empty()); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + if (req[0] == kNullOp) return; + CHECK_EQ(req[0], kWriteTo) << "ElementWiseSumComputeExCPU only supports req = kWriteTo"; + using namespace mshadow; + Stream* s = ctx.get_stream(); + NDArray out_nd = outputs[0]; + if (inputs[0].storage_type() == kRowSparseStorage) { + mxnet::ndarray::ElementwiseSum(s, inputs, &out_nd); + } else { + FCompExFallback(attrs, ctx, inputs, req, outputs, + ElementWiseSumCompute, "ElementWiseSumCompute"); + } +} + NNVM_REGISTER_OP(add_n) .add_alias("ElementWiseSum") +.add_alias("_sparse_add_n") +.add_alias("_sparse_ElementWiseSum") .describe(R"doc(Adds all input arguments element-wise. .. math:: add\_n(a_1, a_2, ..., a_n) = a_1 + a_2 + ... + a_n ``add_n`` is potentially more efficient than calling ``add`` by `n` times. + +The storage type of ``add_n`` output depends on storage types of inputs + +- add_n(row_sparse, row_sparse, ..) 
= row_sparse +- otherwise, ``add_n`` generates output with default storage + )doc" ADD_FILELINE) .set_attr_parser(ParamParser) .set_num_inputs([](const nnvm::NodeAttrs& attrs) { @@ -79,16 +135,16 @@ NNVM_REGISTER_OP(add_n) }) .set_attr("key_var_num_args", "num_args") .set_attr("FCompute", ElementWiseSumCompute) +.set_attr("FComputeEx", ElementWiseSumComputeExCPU) .set_attr( "FInplaceOption", [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; }) -.set_attr("FInferShape", ElemwiseShape<-1, 1>) -.set_attr("FInferType", ElemwiseType<-1, 1>) -.set_attr("FGradient", CloneGradient{"_backward_add_n"}) +.set_attr("FInferShape", ElementWiseSumShape) +.set_attr("FInferType", ElementWiseSumType) +.set_attr("FInferStorageType", ElementWiseSumForwardInferStorageType) +.set_attr("FGradient", ElementWiseSumGrad) .add_argument("args", "NDArray-or-Symbol[]", "Positional input arguments"); - - } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_unary_op.cc b/src/operator/tensor/elemwise_unary_op.cc index defe72d3738c..e94b8bfb9fea 100644 --- a/src/operator/tensor/elemwise_unary_op.cc +++ b/src/operator/tensor/elemwise_unary_op.cc @@ -70,7 +70,9 @@ MXNET_OPERATOR_REGISTER_UNARY(_copy) [](const NodeAttrs& attrs){ return std::vector{true}; }) +.set_attr("FInferStorageType", ElemwiseStorageType<1, 1>) .set_attr("FCompute", IdentityCompute) +.set_attr("FComputeEx", IdentityComputeEx) .set_attr("FGradient", ElemwiseGradUseNone{"_copy"}); NNVM_REGISTER_OP(_backward_copy) @@ -85,7 +87,9 @@ NNVM_REGISTER_OP(_backward_copy) [](const NodeAttrs& attrs){ return std::vector{true}; }) -.set_attr("FCompute", IdentityCompute); +.set_attr("FInferStorageType", ElemwiseStorageType<1, 1>) +.set_attr("FCompute", IdentityCompute) +.set_attr("FComputeEx", IdentityComputeEx); MXNET_OPERATOR_REGISTER_UNARY(BlockGrad) .add_alias("stop_gradient") @@ -162,7 +166,9 @@ NNVM_REGISTER_OP(_identity_with_attr_like_rhs) .set_attr("FIgnoreInputs", [](const NodeAttrs& attrs) { return 
std::vector(1, 1); }) .set_attr("FCompute", IdentityCompute) +.set_attr("FComputeEx", IdentityLikeRhsComputeEx) .set_attr("FInferShape", ElemwiseShape<2, 1>) +.set_attr("FInferStorageType", IdentityAttrLikeRhsStorageType) .set_attr( "FGradient", [](const nnvm::NodePtr& n, const std::vector& ograds) { @@ -219,6 +225,7 @@ NNVM_REGISTER_OP(_backward_cast) }) .set_attr("FCompute", CastCompute); + // negative MXNET_OPERATOR_REGISTER_UNARY(negative) .MXNET_DESCRIBE("Numerical negative of the argument, element-wise.") diff --git a/src/operator/tensor/elemwise_unary_op.cu b/src/operator/tensor/elemwise_unary_op.cu index 4211ea305b4e..f5d711c01a29 100644 --- a/src/operator/tensor/elemwise_unary_op.cu +++ b/src/operator/tensor/elemwise_unary_op.cu @@ -40,7 +40,8 @@ NNVM_REGISTER_OP(_backward_sigmoid) // copy NNVM_REGISTER_OP(_copy) -.set_attr("FCompute", IdentityCompute); +.set_attr("FCompute", IdentityCompute) +.set_attr("FComputeEx", IdentityComputeEx); NNVM_REGISTER_OP(_backward_copy) .set_attr("FCompute", IdentityCompute); @@ -53,7 +54,9 @@ NNVM_REGISTER_OP(make_loss) // identity output as first input, but attributes are constrainted to be like rhs NNVM_REGISTER_OP(_identity_with_attr_like_rhs) -.set_attr("FCompute", IdentityCompute); +.set_attr("FCompute", IdentityCompute) +.set_attr("FComputeEx", IdentityLikeRhsComputeEx); + NNVM_REGISTER_OP(Cast) .set_attr("FCompute", CastCompute); diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index b6994844e0fe..16477b1973d3 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -31,15 +31,17 @@ #include "../mshadow_op.h" #include "../elemwise_op_common.h" #include "../special_functions-inl.h" +#include "./broadcast_reduce-inl.h" +#include "./init_op.h" namespace mxnet { namespace op { template void UnaryLaunch(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& 
outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { using namespace mshadow; using namespace mxnet_op; Stream *s = ctx.get_stream(); @@ -95,6 +97,108 @@ void IdentityCompute(const nnvm::NodeAttrs& attrs, }); } +template +void IdentityComputeRspRspImpl(const nnvm::NodeAttrs& attrs, + mshadow::Stream *s, + const NDArray& input, + const OpReqType req, + NDArray* output) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace rowsparse; + if (req == kNullOp) return; + CHECK_EQ(req, kWriteTo) << "kWriteTo is expected for IdentityComputeRspRspImpl"; + if (!input.storage_initialized()) { + FillZerosRspImpl(s, output); + return; + } + TShape shape = input.aux_shape(kIdx); + output->CheckAndAlloc({shape}); + MSHADOW_TYPE_SWITCH(output->dtype(), DType, { + MSHADOW_TYPE_SWITCH(output->aux_type(kIdx), AuxType, { + auto out_d = output->data().FlatTo1D(s); + auto out_aux = output->aux_data(kIdx).FlatTo1D(s); + auto in_aux = input.aux_data(kIdx).FlatTo1D(s); + ASSIGN_DISPATCH(out_d, req, + F(input.data().FlatTo1D(s))); + ASSIGN_DISPATCH(out_aux, req, F(in_aux)); + }); + }); +} + +template +void IdentityComputeEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const auto in_stype = inputs[0].storage_type(); + const auto out_stype = outputs[0].storage_type(); + mshadow::Stream *s = ctx.get_stream(); + if (req[0] == kNullOp) return; + if (in_stype == out_stype) { + if (in_stype == kDefaultStorage) { // dense ndarray + IdentityCompute(attrs, ctx, {inputs[0].data()}, req, {outputs[0].data()}); + } else if (in_stype == kRowSparseStorage || in_stype == kCSRStorage) { // sparse ndarray + if (!inputs[0].storage_initialized()) { + FillComputeZerosEx(attrs, ctx, inputs, req, outputs); + return; + } + 
CHECK_NE(req[0], kAddTo) << "kAddTo is not supported for IdentityComputeEx"; + const size_t n = mxnet::num_aux_data(out_stype); + outputs[0].CheckAndAlloc(inputs[0].aux_shapes()); + IdentityCompute(attrs, ctx, {inputs[0].data()}, req, {outputs[0].data()}); + for (size_t i = 0; i < n; ++i) { + IdentityCompute(attrs, ctx, {inputs[0].aux_data(i)}, req, {outputs[0].aux_data(i)}); + } + } else { + LOG(FATAL) << "IdentityComputeEx does not support input stype = " << in_stype; + } + } else { + FCompExFallback(attrs, ctx, inputs, req, outputs, IdentityCompute, "IdentityCompute"); + } +} + +inline bool IdentityAttrLikeRhsStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *in_attrs, + std::vector *out_attrs) { + // TODO(junwu): add ctx info into storage inference logic + CHECK_EQ(in_attrs->size(), static_cast(2)) << " in operator " << attrs.name; + CHECK_EQ(out_attrs->size(), static_cast(1)) << " in operator " << attrs.name; + auto &in = *in_attrs; + auto &out = *out_attrs; + CHECK_NE(in[1], kUndefinedStorage) << "rhs storage type must be known"; + if (in[0] == kUndefinedStorage) STORAGE_TYPE_ASSIGN_CHECK(in, 0, in[1]); + if (out[0] == kUndefinedStorage) STORAGE_TYPE_ASSIGN_CHECK(out, 0, in[1]); + return true; +} + +template +void IdentityLikeRhsComputeEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(inputs.size(), 2); + CHECK_EQ(outputs.size(), 1); + Stream *s = ctx.get_stream(); + const auto in_stype = inputs[0].storage_type(); + const auto out_stype = outputs[0].storage_type(); + if (in_stype == out_stype) { + std::vector in{inputs[0]}; + IdentityComputeEx(attrs, ctx, in, req, outputs); + } else { + LOG(FATAL) << "IdentityLikeRhsComputeEx not implemented for in_stype = " << in_stype + << " out_stype = " << out_stype; + } +} + struct CastParam : public dmlc::Parameter { // use int 
for enumeration int dtype; @@ -186,4 +290,5 @@ struct relu_grad { } // namespace op } // namespace mxnet + #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_H_ diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index e5cb41088e22..8c5d4f5411f8 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -104,7 +104,6 @@ NNVM_REGISTER_OP(_backward_Embedding) .set_attr("TIsBackward", true) .set_attr("FCompute", EmbeddingOpBackward); - NNVM_REGISTER_OP(take) .describe(R"code(Takes elements from an input array along the given axis. diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index ef42b01fb5b6..a9ee408082d4 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -40,6 +40,9 @@ #include "../elemwise_op_common.h" #include "../mxnet_op.h" #include "./sort_op.h" +#include "./dot-inl.h" +#include "./init_op.h" +#include "./matrix_op-inl.h" namespace mxnet { namespace op { diff --git a/src/operator/tensor/init_op.cc b/src/operator/tensor/init_op.cc index 8dac22a64966..9f333d2d5efe 100644 --- a/src/operator/tensor/init_op.cc +++ b/src/operator/tensor/init_op.cc @@ -39,6 +39,7 @@ NNVM_REGISTER_OP(_zeros) .set_attr("FInferShape", InitShape) .set_attr("FInferType", InitType) .set_attr("FCompute", FillCompute) +.set_attr("FComputeEx", FillComputeZerosEx) .add_arguments(InitOpParam::__FIELDS__()); NNVM_REGISTER_OP(_ones) diff --git a/src/operator/tensor/init_op.cu b/src/operator/tensor/init_op.cu index 6e2b65cc8519..cbee203c2b31 100644 --- a/src/operator/tensor/init_op.cu +++ b/src/operator/tensor/init_op.cu @@ -27,7 +27,8 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_zeros) -.set_attr("FCompute", FillCompute); +.set_attr("FCompute", FillCompute) +.set_attr("FComputeEx", FillComputeZerosEx); NNVM_REGISTER_OP(_ones) .set_attr("FCompute", FillCompute); diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index 
30a5a3a3af1b..12999b943be4 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -33,6 +33,8 @@ #include #include #include "../elemwise_op_common.h" +#include "../mxnet_op.h" + namespace mxnet { namespace op { @@ -129,7 +131,6 @@ inline bool InitType(const nnvm::NodeAttrs& attrs, return true; } - template void FillCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -145,6 +146,91 @@ void FillCompute(const nnvm::NodeAttrs& attrs, }); } +// Fill in the indices and values of a RowSparse NDArray to represent a zeros NDArray, +// instead of the usual compact representation. +template +inline void FillDnsZerosRspImpl(mshadow::Stream *s, NDArray *dst) { + using namespace rowsparse; + using namespace mshadow::expr; + using namespace mshadow; + using namespace mxnet_op; + CHECK_EQ(dst->storage_type(), kRowSparseStorage); + MSHADOW_REAL_TYPE_SWITCH(dst->dtype(), DType, { + MSHADOW_IDX_TYPE_SWITCH(dst->aux_type(kIdx), IType, { + auto num_rows = dst->shape()[0]; + dst->CheckAndAlloc({Shape1(num_rows)}); + auto idx = dst->aux_data(kIdx).FlatTo1D(s); + auto val = dst->data(); + Kernel::Launch(s, val.Size(), val.dptr()); + ASSIGN_DISPATCH(idx, kWriteTo, range(0, num_rows, 1, 1)); + }); + }); +} + +struct PopulateFullIdxRspKernel { + template + MSHADOW_XINLINE static void Map(int i, IType* out) { + KERNEL_ASSIGN(out[i], kWriteTo, i); + } +}; + +// Fill full indices NDArray with zeros by updating the aux shape. +template +void PopulateFullIdxRspImpl(mshadow::Stream *s, NDArray *dst) { + using namespace rowsparse; + CHECK_EQ(dst->storage_type(), kRowSparseStorage); + nnvm::dim_t nnr = dst->shape()[0]; + dst->CheckAndAllocAuxData(kIdx, mshadow::Shape1(nnr)); + MSHADOW_IDX_TYPE_SWITCH(dst->aux_type(kIdx), IType, { + IType* idx = dst->aux_data(kIdx).dptr(); + mxnet_op::Kernel::Launch(s, nnr, idx); + }); +} + +// Fill a rsp NDArray with zeros by updating the aux shape. 
+template +void FillZerosRspImpl(mshadow::Stream *s, NDArray *dst) { + if (!dst->storage_initialized()) return; + // reset the shapes if it's not zeros + auto storage_shape = dst->storage_shape(); + storage_shape[0] = 0; + dst->set_aux_shape(rowsparse::kIdx, TShape(mshadow::Shape1(0))); +} + +// Fill a CSR NDArray with zeros by updating the aux shape. +template +void FillZerosCsrImpl(mshadow::Stream *s, NDArray *dst) { + if (!dst->storage_initialized()) return; + // reset the shapes if it's not zeros + TShape new_shape(mshadow::Shape1(0)); + dst->set_aux_shape(csr::kIndPtr, new_shape); + dst->set_aux_shape(csr::kIdx, new_shape); +} + +template +void FillComputeZerosEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + CHECK_EQ(outputs.size(), 1); + auto stype = outputs[0].storage_type(); + if (req[0] == kNullOp) return; + CHECK_EQ(req[0], kWriteTo) << "kWriteTo is expected for FillComputeZerosEx"; + if (stype == kRowSparseStorage) { + NDArray nd(outputs[0]); + FillZerosRspImpl(s, &nd); + } else if (stype == kCSRStorage) { + NDArray nd(outputs[0]); + FillZerosCsrImpl(s, &nd); + } else { + // no fallback is required since the output doesn't depend on input + LOG(FATAL) << "storage type " << stype << " not implemented."; + } +} template void RangeCompute(const nnvm::NodeAttrs& attrs, diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index af0de593c1be..4654b37ab2bc 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -28,6 +28,7 @@ #include #include #include +#include #include "../mshadow_op.h" #include "../elemwise_op_common.h" #include "../channel_op_common.h" @@ -368,364 +369,6 @@ inline bool ExpandDimShape(const nnvm::NodeAttrs& attrs, return true; } -struct DotParam : public dmlc::Parameter { - bool 
transpose_a; - bool transpose_b; - DMLC_DECLARE_PARAMETER(DotParam) { - DMLC_DECLARE_FIELD(transpose_a) - .describe("If true then transpose the first input before dot.") - .set_default(false); - DMLC_DECLARE_FIELD(transpose_b) - .describe("If true then transpose the second input before dot.") - .set_default(false); - } -}; - -template -void DotForward_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mshadow; - using namespace mshadow::expr; - const DotParam& param = nnvm::get(attrs.parsed); - Stream *s = ctx.get_stream(); - CHECK_EQ(outputs[0].type_flag_, inputs[0].type_flag_) - << "Binary function only support input/output with the same type"; - CHECK_EQ(outputs[0].type_flag_, inputs[1].type_flag_) - << "Binary function only support input/output with the same type"; - CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) - << "dot only supports float32 and float64"; - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - if (inputs[0].ndim() == 1 && inputs[1].ndim() == 1) { - CHECK_NE(req[0], kAddTo) << "AddTo not yet suported"; - Tensor out = outputs[0].get(s); - VectorDot(out, - inputs[0].get(s), - inputs[1].get(s)); - } else { - int ma, na, mb, nb, m, n; - if (param.transpose_a) { - ma = inputs[0].size(0); - na = inputs[0].Size()/ma; - m = na; - } else { - na = inputs[0].size(inputs[0].ndim()-1); - ma = inputs[0].Size()/na; - m = ma; - } - if (param.transpose_b) { - nb = inputs[1].size(inputs[1].ndim()-1); - mb = inputs[1].Size()/nb; - n = mb; - } else { - mb = inputs[1].size(0); - nb = inputs[1].Size()/mb; - n = nb; - } - Tensor input0 = - inputs[0].get_with_shape(Shape2(ma, na), s); - Tensor input1 = - inputs[1].get_with_shape(Shape2(mb, nb), s); - Tensor out = - outputs[0].get_with_shape(Shape2(m, n), s); - if (param.transpose_a && param.transpose_b) { - ASSIGN_DISPATCH(out, req[0], dot(input0.T(), input1.T())); - } else if 
(!param.transpose_a && param.transpose_b) { - ASSIGN_DISPATCH(out, req[0], dot(input0, input1.T())); - } else if (param.transpose_a && !param.transpose_b) { - ASSIGN_DISPATCH(out, req[0], dot(input0.T(), input1)); - } else { - ASSIGN_DISPATCH(out, req[0], dot(input0, input1)); - } - } - }); -} - -template -void DotBackward_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mshadow; - using namespace mshadow::expr; - const DotParam& param = nnvm::get(attrs.parsed); - Stream *s = ctx.get_stream(); - CHECK_NE(req[0], kWriteInplace); - CHECK_NE(req[1], kWriteInplace); - CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) - << "dot only supports float32 and float64"; - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - if (inputs[1].ndim() == 1 && inputs[2].ndim() == 1) { - Tensor mout_grad = inputs[0].get(s); - Tensor mlhs_data = inputs[1].get(s); - Tensor mrhs_data = inputs[2].get(s); - Tensor mlhs_grad = outputs[0].get(s); - Tensor mrhs_grad = outputs[1].get(s); - ASSIGN_DISPATCH(mrhs_grad, req[1], - broadcast_scalar(mout_grad, mlhs_data.shape_) * mlhs_data); - ASSIGN_DISPATCH(mlhs_grad, req[0], - broadcast_scalar(mout_grad, mlhs_data.shape_) * mrhs_data); - } else { - int ma, na, mb, nb, m, n; - if (param.transpose_a) { - ma = outputs[0].size(0); - na = outputs[0].Size()/ma; - m = na; - } else { - na = outputs[0].size(outputs[0].ndim()-1); - ma = outputs[0].Size()/na; - m = ma; - } - if (param.transpose_b) { - nb = outputs[1].size(outputs[1].ndim()-1); - mb = outputs[1].Size()/nb; - n = mb; - } else { - mb = outputs[1].size(0); - nb = outputs[1].Size()/mb; - n = nb; - } - Tensor mout_grad = - inputs[0].get_with_shape(Shape2(m, n), s); - Tensor mlhs_data = - inputs[1].get_with_shape(Shape2(ma, na), s); - Tensor mrhs_data = - inputs[2].get_with_shape(Shape2(mb, nb), s); - Tensor mlhs_grad = - outputs[0].get_with_shape(Shape2(ma, 
na), s); - Tensor mrhs_grad = - outputs[1].get_with_shape(Shape2(mb, nb), s); - if (param.transpose_a && param.transpose_b) { - // Gradient of z = dot(x.T, y.T) - // dy = dot(x, dz).T = dot(dz.T, x.T) - // dx = dot(dz, y).T = dot(y.T, dz.T) - ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mout_grad.T(), mlhs_data.T())); - ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mrhs_data.T(), mout_grad.T())); - } else if (!param.transpose_a && param.transpose_b) { - // Gradient of z = dot(x, y.T) - // dy = dot(x.T, dz).T = dot(dz.T, x) - // dx = dot(dz, y) - ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mout_grad.T(), mlhs_data)); - ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mout_grad, mrhs_data)); - } else if (param.transpose_a && !param.transpose_b) { - // Gradient of z = dot(x.T, y) - // dy = dot(x, dz) - // dx = dot(dz, y.T).T = dot(y, dz.T) - ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mlhs_data, mout_grad)); - ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mrhs_data, mout_grad.T())); - } else { - // Gradient of z = dot(x, y) - // dy = dot(x.T, dz) - // dx = dot(dz, y.T) - ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mlhs_data.T(), mout_grad)); - ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mout_grad, mrhs_data.T())); - } - } - }); -} - -inline bool DotShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - const DotParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(in_attrs->size(), 2U); - CHECK_EQ(out_attrs->size(), 1U); - TShape& lshape = (*in_attrs)[0]; - TShape& rshape = (*in_attrs)[1]; - if (lshape.ndim() == 1 && rshape.ndim() == 1) { - CHECK(!param.transpose_a && !param.transpose_b) << "Cannot transpose vectors"; - CHECK_EQ(lshape[0], rshape[0]) << "dot shape error: " << lshape << " X " << rshape; - SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape1(1)); - } else { - bool Ta = param.transpose_a, Tb = param.transpose_b; - TShape L[2], R[2]; - if (Ta) { - L[0] = mshadow::Shape1(lshape[0]); - L[1] = lshape.ndim() > 1 ? 
TShape(&lshape[1], &lshape[lshape.ndim()]) : TShape(1); - } else { - L[0] = lshape.ndim() > 1 ? TShape(&lshape[0], &lshape[lshape.ndim()-1]) : TShape(1); - L[1] = mshadow::Shape1(lshape[lshape.ndim()-1]); - } - if (Tb) { - R[0] = rshape.ndim() > 1 ? TShape(&rshape[0], &rshape[rshape.ndim()-1]) : TShape(1); - R[1] = mshadow::Shape1(rshape[rshape.ndim()-1]); - } else { - R[0] = mshadow::Shape1(rshape[0]); - R[1] = rshape.ndim() > 1 ? TShape(&rshape[1], &rshape[rshape.ndim()]) : TShape(1); - } - - if (L[!Ta].Size() != 0 && R[Tb].Size() != 0) { - CHECK_EQ(L[!Ta].Size(), R[Tb].Size()) - << "dot shape error: " << lshape << " X " << rshape; - } - std::vector buf; - if (lshape.ndim() > 1) buf.insert(buf.end(), &L[Ta][0], &L[Ta][L[Ta].ndim()]); - if (rshape.ndim() > 1) buf.insert(buf.end(), &R[!Tb][0], &R[!Tb][R[!Tb].ndim()]); - TShape oshape(buf.begin(), buf.end()); - SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); - } - return true; -} - -template -void BatchDotForward_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mshadow; - using namespace mshadow::expr; - mshadow::Stream *s = ctx.get_stream(); - const DotParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(outputs[0].type_flag_, inputs[0].type_flag_) - << "Binary function only support input/output with the same type"; - CHECK_EQ(outputs[0].type_flag_, inputs[1].type_flag_) - << "Binary function only support input/output with the same type"; - CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) - << "dot only supports float32 and float64"; - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - mshadow::Tensor out = outputs[0].get(s); - mshadow::Tensor mlhs = inputs[0].get(s); - mshadow::Tensor mrhs = inputs[1].get(s); - mshadow::Tensor workspace = - ctx.requested[0].get_space_typed(mshadow::Shape1(3 * out.size(0)), s); - if (kNullOp != req[0]) { - if (param.transpose_a && 
param.transpose_b) { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); - } else if (!param.transpose_a && param.transpose_b) { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); - } else if (param.transpose_a && !param.transpose_b) { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); - } else { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); - } - } - }); -} - -template -void BatchDotBackward_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mshadow; - using namespace mshadow::expr; - mshadow::Stream *s = ctx.get_stream(); - const DotParam& param = nnvm::get(attrs.parsed); - CHECK_NE(req[1], kWriteInplace); - CHECK_NE(req[0], kWriteInplace); - CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64) - << "dot only supports float32 and float64"; - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - mshadow::Tensor mout_grad = inputs[0].get(s); - mshadow::Tensor mlhs_data = inputs[1].get(s); - mshadow::Tensor mrhs_data = inputs[2].get(s); - mshadow::Tensor mlhs_grad = outputs[0].get(s); - mshadow::Tensor mrhs_grad = outputs[1].get(s); - mshadow::Tensor workspace = - ctx.requested[0].get_space_typed( - mshadow::Shape2(2, 3 * mout_grad.size(0)), s); - mshadow::Tensor rhs_workspace = workspace[0]; - mshadow::Tensor lhs_workspace = workspace[1]; - if (param.transpose_a && param.transpose_b) { - // Gradient of z = dot(x.T, y.T) - // dy = dot(x, dz).T = dot(dz.T, x.T) - // dx = dot(dz, y).T = dot(y.T, dz.T) - if (kNullOp != req[1]) { - mshadow::BatchGEMM(mrhs_grad, mout_grad, mlhs_data, (DType)1.0f, - (kAddTo == req[1]) ? 
(DType)1.0f : (DType)0.0f, - rhs_workspace); - } - if (kNullOp != req[0]) { - mshadow::BatchGEMM(mlhs_grad, mrhs_data, mout_grad, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - lhs_workspace); - } - } else if (!param.transpose_a && param.transpose_b) { - // Gradient of z = dot(x, y.T) - // dy = dot(x.T, dz).T = dot(dz.T, x) - // dx = dot(dz, y) - if (kNullOp != req[1]) { - mshadow::BatchGEMM(mrhs_grad, mout_grad, mlhs_data, (DType)1.0f, - (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, - rhs_workspace); - } - if (kNullOp != req[0]) { - mshadow::BatchGEMM(mlhs_grad, mout_grad, mrhs_data, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - lhs_workspace); - } - } else if (param.transpose_a && !param.transpose_b) { - // Gradient of z = dot(x.T, y) - // dy = dot(x, dz) - // dx = dot(dz, y.T).T = dot(y, dz.T) - if (kNullOp != req[1]) { - mshadow::BatchGEMM(mrhs_grad, mlhs_data, mout_grad, (DType)1.0f, - (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, - rhs_workspace); - } - if (kNullOp != req[0]) { - mshadow::BatchGEMM(mlhs_grad, mrhs_data, mout_grad, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - lhs_workspace); - } - } else { - // Gradient of z = dot(x, y) - // dy = dot(x.T, dz) - // dx = dot(dz, y.T) - if (kNullOp != req[1]) { - mshadow::BatchGEMM(mrhs_grad, mlhs_data, mout_grad, (DType)1.0f, - (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, - rhs_workspace); - } - if (kNullOp != req[0]) { - mshadow::BatchGEMM(mlhs_grad, mout_grad, mrhs_data, (DType)1.0f, - (kAddTo == req[0]) ? 
(DType)1.0f : (DType)0.0f, - lhs_workspace); - } - } - }); -} - -inline bool BatchDotShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 2U); - CHECK_EQ(out_attrs->size(), 1U); - const DotParam& param = nnvm::get(attrs.parsed); - TShape& lshape = (*in_attrs)[0]; - TShape& rshape = (*in_attrs)[1]; - if (lshape.ndim() == 3 && rshape.ndim() == 3) { - CHECK(lshape[0] == rshape[0]) - << "batch_dot shape error(batch_size must be equal): " << lshape << " X " << rshape - << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; - index_t out_m = param.transpose_a ? lshape[2] : lshape[1]; - index_t lshape_k = param.transpose_a ? lshape[1] : lshape[2]; - index_t out_n = param.transpose_b ? rshape[1] : rshape[2]; - index_t rshape_k = param.transpose_b ? rshape[2] : rshape[1]; - CHECK(lshape_k == rshape_k) - << "batch_dot shape error(shape mismatch): " << lshape << " X " << rshape - << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; - SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape3(lshape[0], out_m, out_n)); - } else { - LOG(FATAL) << "batch_dot currently only support 3D*3D array" - << lshape << " v.s. " << rshape; - } - return true; -} - struct SliceParam : public dmlc::Parameter { nnvm::Tuple > begin, end; DMLC_DECLARE_PARAMETER(SliceParam) { @@ -845,6 +488,96 @@ void Slice(const nnvm::NodeAttrs& attrs, }); } +// slice the indptr of a csr +struct SliceCsrIndPtr { + template + MSHADOW_XINLINE static void Map(int i, IType* out, const IType* in, const IType* base) { + KERNEL_ASSIGN(out[i], kWriteTo, in[i] - *base); + } +}; + +/* + * a wrapper to launch SliceCsrIndPtr kernel. + * slice [src[begin] .. 
src[end]) and store in dst[0, end - begin) + */ +template +void SliceCsrIndPtrImpl(const int begin, const int end, RunContext ctx, + const IType* src, IType* dst) { + using namespace mshadow; + using namespace mxnet_op; + Stream *s = ctx.get_stream(); + int indptr_len = end - begin + 1; + Kernel::Launch(s, indptr_len, dst, src + begin, src + begin); +} + +/* + * Slice a CSR NDArray + * Only implemented for CPU + */ +template +void SliceCsrImpl(const SliceParam ¶m, const OpContext& ctx, + const NDArray &in, OpReqType req, const NDArray &out) { + using namespace mshadow; + using namespace mxnet_op; + using namespace csr; + CHECK((std::is_same::value)) << "Slice for CSR input only implemented for CPU"; + if (req == kNullOp) return; + CHECK_NE(req, kAddTo) << "kAddTo for Slice on CSR input is not supported"; + CHECK_NE(req, kWriteInplace) << "kWriteInplace for Slice on CSR input is not supported"; + Stream *s = ctx.get_stream(); + int begin = *param.begin[0]; + int end = *param.end[0]; + int indptr_len = end - begin + 1; + out.CheckAndAllocAuxData(kIndPtr, Shape1(indptr_len)); + if (!in.storage_initialized()) { + out.set_aux_shape(kIndPtr, Shape1(0)); + return; + } + // assume idx indptr share the same type + MSHADOW_IDX_TYPE_SWITCH(in.aux_type(kIndPtr), RType, { + MSHADOW_IDX_TYPE_SWITCH(in.aux_type(kIdx), IType, { + MSHADOW_TYPE_SWITCH(in.dtype(), DType, { + auto in_indptr = in.aux_data(kIndPtr).dptr(); + auto out_indptr = out.aux_data(kIndPtr).dptr(); + SliceCsrIndPtrImpl(begin, end, ctx.run_ctx, in_indptr, out_indptr); + + // retrieve nnz (CPU implementation) + int nnz = out_indptr[indptr_len - 1]; + // copy indices and values + out.CheckAndAllocAuxData(kIdx, Shape1(nnz)); + out.CheckAndAllocData(Shape1(nnz)); + auto in_idx = in.aux_data(kIdx).dptr(); + auto out_idx = out.aux_data(kIdx).dptr(); + auto in_data = in.data().dptr(); + auto out_data = out.data().dptr(); + int offset = in_indptr[begin]; + // this is also a CPU-only implementation + memcpy(out_idx, in_idx 
+ offset, nnz * sizeof(IType)); + memcpy(out_data, in_data + offset, nnz * sizeof(DType)); + }); + }); + }); +} + +template +void SliceEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1); + CHECK_EQ(outputs.size(), 1); + const SliceParam& param = nnvm::get(attrs.parsed); + auto in_stype = inputs[0].storage_type(); + CHECK_NE(in_stype, kDefaultStorage) + << "SliceEx is not expected to execute for input with default storage type"; + if (in_stype == kCSRStorage) { + SliceCsrImpl(param, ctx, inputs[0], req[0], outputs[0]); + } else { + LOG(FATAL) << "Slice not implemented for storage type" << in_stype; + } +} + inline bool SliceAssignShape(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index e7e8f5548a1c..d409b9ec6056 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -34,7 +34,6 @@ DMLC_REGISTER_PARAMETER(ClipParam); DMLC_REGISTER_PARAMETER(SimpleCropAssignScalarParam); DMLC_REGISTER_PARAMETER(SliceParam); DMLC_REGISTER_PARAMETER(SliceAxisParam); -DMLC_REGISTER_PARAMETER(DotParam); DMLC_REGISTER_PARAMETER(RepeatParam); DMLC_REGISTER_PARAMETER(TileParam); DMLC_REGISTER_PARAMETER(ReverseParam); @@ -263,6 +262,9 @@ and ``end=(e_1, e_2, ... e_n)`` indices will result in an array with the shape The resulting array's *k*-th dimension contains elements from the *k*-th dimension of the input array with the open range ``[b_k, e_k)``. +For an input array of non-default storage type(e.g. `csr` or `row_sparse`), it only supports +slicing on the first dimension. 
+ Example:: x = [[ 1., 2., 3., 4.], @@ -276,8 +278,10 @@ Example:: .set_attr_parser(ParamParser) .set_attr("FInferShape", SliceShape) .set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FInferStorageType", ElemwiseStorageType<1, 1>) .set_attr("FGradient", ElemwiseGradUseNone{"_backward_slice"}) .set_attr("FCompute", Slice) +.set_attr("FComputeEx", SliceEx) .add_argument("data", "NDArray-or-Symbol", "Source input") .add_arguments(SliceParam::__FIELDS__()); @@ -370,94 +374,6 @@ NNVM_REGISTER_OP(_backward_slice_axis) .set_attr("TIsBackward", true) .set_attr("FCompute", SliceAxisGrad_); -NNVM_REGISTER_OP(dot) -.describe(R"doc(Dot product of two arrays. - -``dot``'s behavior depends on the input array dimensions: - -- 1-D arrays: inner product of vectors -- 2-D arrays: matrix multiplication -- N-D arrays: a sum product over the last axis of the first input and the first - axis of the second input - - For example, given 3-D ``x`` with shape `(n,m,k)` and ``y`` with shape `(k,r,s)`, the - result array will have shape `(n,m,r,s)`. 
It is computed by:: - - dot(x,y)[i,j,a,b] = sum(x[i,j,:]*y[:,a,b]) - - Example:: - - x = reshape([0,1,2,3,4,5,6,7], shape=(2,2,2)) - y = reshape([7,6,5,4,3,2,1,0], shape=(2,2,2)) - dot(x,y)[0,0,1,1] = 0 - sum(x[0,0,:]*y[:,1,1]) = 0 -)doc" ADD_FILELINE) -.set_num_inputs(2) -.set_num_outputs(1) -.set_attr_parser(ParamParser) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - return std::vector{"lhs", "rhs"}; - }) -.set_attr("FInferShape", DotShape) -.set_attr("FInferType", ElemwiseType<2, 1>) -.set_attr("FCompute", DotForward_) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_dot"}) -.add_argument("lhs", "NDArray-or-Symbol", "The first input") -.add_argument("rhs", "NDArray-or-Symbol", "The second input") -.add_arguments(DotParam::__FIELDS__()); - -NNVM_REGISTER_OP(_backward_dot) -.set_num_inputs(3) -.set_num_outputs(2) -.set_attr_parser(ParamParser) -.set_attr("TIsBackward", true) -.set_attr("FCompute", DotBackward_) -.add_arguments(DotParam::__FIELDS__()); - -NNVM_REGISTER_OP(batch_dot) -.describe(R"doc(Batchwise dot product. - -``batch_dot`` is used to compute dot product of ``x`` and ``y`` when ``x`` and -``y`` are data in batch, namely 3D arrays in shape of `(batch_size, :, :)`. 
- -For example, given ``x`` with shape `(batch_size, n, m)` and ``y`` with shape -`(batch_size, m, k)`, the result array will have shape `(batch_size, n, k)`, -which is computed by:: - - batch_dot(x,y)[i,:,:] = dot(x[i,:,:], y[i,:,:]) - -)doc" ADD_FILELINE) -.set_num_inputs(2) -.set_num_outputs(1) -.set_attr_parser(ParamParser) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - return std::vector{"lhs", "rhs"}; - }) -.set_attr("FInferShape", BatchDotShape) -.set_attr("FInferType", ElemwiseType<2, 1>) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("FCompute", BatchDotForward_) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_batch_dot"}) -.add_argument("lhs", "NDArray-or-Symbol", "The first input") -.add_argument("rhs", "NDArray-or-Symbol", "The second input") -.add_arguments(DotParam::__FIELDS__()); - -NNVM_REGISTER_OP(_backward_batch_dot) -.set_num_inputs(3) -.set_num_outputs(2) -.set_attr_parser(ParamParser) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("TIsBackward", true) -.set_attr("FCompute", BatchDotBackward_); - NNVM_REGISTER_OP(clip) .describe(R"code(Clips (limits) the values in an array. 
diff --git a/src/operator/tensor/matrix_op.cu b/src/operator/tensor/matrix_op.cu index ca40419a9367..3cf2a7a753d0 100644 --- a/src/operator/tensor/matrix_op.cu +++ b/src/operator/tensor/matrix_op.cu @@ -57,18 +57,6 @@ NNVM_REGISTER_OP(slice_axis) NNVM_REGISTER_OP(_backward_slice_axis) .set_attr("FCompute", SliceAxisGrad_); -NNVM_REGISTER_OP(dot) -.set_attr("FCompute", DotForward_); - -NNVM_REGISTER_OP(_backward_dot) -.set_attr("FCompute", DotBackward_); - -NNVM_REGISTER_OP(batch_dot) -.set_attr("FCompute", BatchDotForward_); - -NNVM_REGISTER_OP(_backward_batch_dot) -.set_attr("FCompute", BatchDotBackward_); - NNVM_REGISTER_OP(clip) .set_attr("FCompute", Clip); diff --git a/src/operator/tensor/sparse_retain-inl.h b/src/operator/tensor/sparse_retain-inl.h new file mode 100644 index 000000000000..5add57c83b24 --- /dev/null +++ b/src/operator/tensor/sparse_retain-inl.h @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file sparse_retain-inl.h + * \brief +*/ +#ifndef MXNET_OPERATOR_TENSOR_SPARSE_RETAIN_INL_H_ +#define MXNET_OPERATOR_TENSOR_SPARSE_RETAIN_INL_H_ + +#include +#include +#include +#include "./init_op.h" +#include "../mshadow_op.h" +#include "../elemwise_op_common.h" +#include "../mxnet_op.h" + +namespace mxnet { +namespace op { + +/*! + * \brief sparse retain namespace + */ +namespace sr { +enum SparseRetainOpInputs {kArr, kIdx}; +enum SparseRetainOpOutputs {kOut}; +} // namespace sr + +inline bool SparseRetainOpShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 2U) + << "sparse_retain operator takes 2 arguments (" << in_attrs->size() << " given)"; + CHECK_EQ(out_attrs->size(), 1U); + + TShape tshape((*in_attrs)[sr::kArr]); + shape_assign(&tshape, (*out_attrs)[sr::kOut]); + SHAPE_ASSIGN_CHECK(*in_attrs, sr::kArr, tshape); + SHAPE_ASSIGN_CHECK(*out_attrs, sr::kOut, tshape); + return true; +} + +inline bool SparseRetainOpType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + CHECK_NE((*in_attrs)[sr::kIdx], -1) << "Index type must be set for sparse_retain operator"; + + TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[sr::kArr]); + TYPE_ASSIGN_CHECK(*in_attrs, 0, (*out_attrs)[sr::kOut]); + return (*in_attrs)[0] != -1; +} + +inline bool SparseRetainForwardInferStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + type_assign(&(in_attrs->at(sr::kArr)), kRowSparseStorage); + type_assign(&(in_attrs->at(sr::kIdx)), kDefaultStorage); + type_assign(&(out_attrs->at(sr::kOut)), kRowSparseStorage); + return true; +} + +inline bool SparseRetainBackwardInferStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector *in_attrs, + std::vector *out_attrs) { + 
CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 2U); + + type_assign(&(in_attrs->at(sr::kOut)), kDefaultStorage); + type_assign(&(in_attrs->at(sr::kIdx)), kDefaultStorage); + type_assign(&(out_attrs->at(sr::kArr)), kRowSparseStorage); + type_assign(&(out_attrs->at(sr::kIdx)), kDefaultStorage); + return true; +} + +/*! + * \brief Each thread searches for a user input index in the input + * row sparse ndarray alternatively. This ensures each thread + * has the almost the same workload. The overhead is the binary + * search. If all the indices of the idx array are contained + * in the in_idx, one should use SparseRetainRspRowBlockKernel instead, + * where each thread only perform binary search once. + */ +struct SparseRetainRspThreadKernel { + template + MSHADOW_XINLINE static void Map(int i, DType* out_data, RType* out_idx, + const DType* in_data, const RType* in_idx, + const IType* idx, const size_t nnr, + const size_t row_length) { + const RType irow = idx[i]; + int j = -1, left = 0, right = nnr - 1; + while (left <= right) { + int m = left + (right - left) / 2; + const auto in_idx_m = in_idx[m]; + if (in_idx_m == irow) { + j = m; + break; + } else if (in_idx_m < irow) { + left = m + 1; + } else { + right = m - 1; + } + } + out_idx[i] = idx[i]; + if (j >= 0) { + const size_t in_offset = j * row_length; + const size_t out_offset = i * row_length; + for (size_t k = 0; k < row_length; ++k) { + out_data[out_offset+k] = in_data[in_offset+k]; + } + } + } +}; + +/*! + * \brief This kernel should be invoked when the row indices + * to be retained are all in the input rsp. + * Each thread searches for a subarray of indices of + * the user-input idx array for retain. The first index + * in the subarray will be searched for using binary search. + * The rest of the indices will be searched for starting from + * the lower bound of the binary search. This kernel assumes + * that idx has been sorted in ascending order. 
+ */ +struct SparseRetainRspRowBlockKernel { + template + MSHADOW_XINLINE static void Map(int i, DType* out_data, RType* out_idx, + const DType* in_data, const RType* in_idx, + const IType* idx, const size_t num_indices, + const size_t nnr, const size_t row_length, + const size_t seg_len) { + const size_t seg_start = i * seg_len; + if (seg_start >= num_indices) return; + const size_t seg_end = (seg_start+seg_len < num_indices? seg_start+seg_len : num_indices); + for (size_t j = seg_start; j < seg_end; ++j) { + out_idx[j] = idx[j]; + } + // use binary search to find the lower bound of idx[seg_start] in in_idx + const RType* first = in_idx; + const RType* last = in_idx + nnr; + const auto val = idx[seg_start]; + const RType* it; + int count = last - first, step; + while (count > 0) { + it = first; + step = count / 2; + it += step; + if (*it < val) { + first = ++it; + count -= step + 1; + } else { + count = step; + } + } + size_t cur_row_idx = first - in_idx; + // end of binary search + if (cur_row_idx == nnr || in_idx[cur_row_idx] > idx[seg_end-1]) { + return; + } + size_t cur_idx = seg_start; + while (cur_row_idx < nnr && cur_idx < seg_end) { + if (in_idx[cur_row_idx] == idx[cur_idx]) { + const size_t in_offset = cur_row_idx * row_length; + const size_t out_offset = cur_idx * row_length; + for (size_t k = 0; k < row_length; ++k) { + out_data[out_offset+k] = in_data[in_offset+k]; + } + ++cur_row_idx; + ++cur_idx; + } else if (in_idx[cur_row_idx] < idx[cur_idx]) { + ++cur_row_idx; + } else { + ++cur_idx; + } + } + } +}; + +/*! + * Copy input indices to output indices. + * Only used when input rsp is dense. + */ +struct SparseRetainCopyIndices { + template + MSHADOW_XINLINE static void Map(int i, RType* out_idx, IType* idx) { + out_idx[i] = idx[i]; + } +}; + +/*! + * Copy input retained rows to output rows. + * Only used when input rsp is dense. + * This kernel is only used when ctx is on GPU. + * So it's parallelized by out_rows' elements, + * instead of rows. 
+ * For CPU ctx, we simply call mshadow::Copy. + */ +struct SparseRetainCopyRetainedRowsFromDns { + template + MSHADOW_XINLINE static void Map(int i, DType* out_rows, const DType* in_rows, + const RType* in_row_idx, const IType* idx, + const size_t row_length) { + const size_t irow = i / row_length; + const size_t icol = i % row_length; + out_rows[i] = in_rows[static_cast(idx[irow]) * row_length + icol]; + } +}; + +template +void SparseRetainOpForwardRspImpl(mshadow::Stream *s, + const NDArray& input_nd, + const TBlob& idx_data, + const OpReqType req, + NDArray* output_nd) { + if (req == kNullOp) return; + CHECK_EQ(req, kWriteTo) << "SparseRetainOpForwardRspImpl only support req = kWriteTo now"; + CHECK_EQ(input_nd.storage_type(), kRowSparseStorage) + << "SparseRetainOpForwardRspImpl operator only takes row sparse NDArray as input"; + CHECK_EQ(output_nd->storage_type(), kRowSparseStorage) + << "SparseRetainOpForwardRspImpl operator only outputs row sparse NDArray"; + + if (!input_nd.storage_initialized() + || idx_data.Size() == 0U + || input_nd.shape()[0] == 0) { + FillZerosRspImpl(s, output_nd); + return; + } + + const TBlob input_data = input_nd.data(); + const TBlob input_idx = input_nd.aux_data(rowsparse::kIdx); + + output_nd->CheckAndAlloc({mshadow::Shape1(idx_data.Size())}); + TBlob output_data = output_nd->data(); + TBlob output_idx = output_nd->aux_data(rowsparse::kIdx); + const auto row_length = input_data.shape_.ProdShape(1, input_data.shape_.ndim()); + + using namespace mxnet_op; + MSHADOW_TYPE_SWITCH(output_data.type_flag_, DType, { // output data type + Kernel::Launch(s, output_data.Size(), output_data.dptr()); + MSHADOW_IDX_TYPE_SWITCH(output_idx.type_flag_, RType, { // row index data type + MSHADOW_TYPE_SWITCH(idx_data.type_flag_, IType, { // index array data type + if (input_idx.Size() == input_nd.shape()[0]) { // input rsp is dense + using namespace mshadow; + // copy indices + Tensor output_idx_tensor = output_idx.FlatTo1D(s); + const size_t 
num_rows_retained = output_idx.Size(); + if (output_idx.type_flag_ == idx_data.type_flag_) { // same type, use Copy + const Tensor idx_tensor = idx_data.FlatTo1D(s); + Copy(output_idx_tensor, idx_tensor, s); + } else { // different index types, use Kernel::Launch + Kernel::Launch(s, num_rows_retained, + output_idx.dptr(), idx_data.dptr()); + } + // copy data + if (std::is_same::value) { // For cpu, we can access output_idx_tensor[i] + const Tensor input_tensor = + input_data.get_with_shape(Shape2(input_data.shape_[0], row_length), s); + Tensor output_tensor = + output_data.get_with_shape(Shape2(output_data.shape_[0], row_length), + s); + for (size_t i = 0; i < num_rows_retained; ++i) { + Copy(output_tensor[i], input_tensor[output_idx_tensor[i]], s); + } + } else { // For gpu, have to kernel launch + Kernel::Launch(s, output_data.Size(), + output_data.dptr(), input_data.dptr(), input_idx.dptr(), + idx_data.dptr(), row_length); + } + } else { // input rsp is not dense + Kernel::Launch(s, idx_data.Size(), + output_data.dptr(), output_idx.dptr(), input_data.dptr(), + input_idx.dptr(), idx_data.dptr(), input_data.shape_[0], row_length); + } + }); + }); + }); +} + +template +void SparseRetainOpForwardEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + if (req[sr::kOut] == kNullOp) return; + CHECK_EQ(req[sr::kOut], kWriteTo) << "sparse_retain only supports req=\'write\'"; + CHECK_EQ(inputs[sr::kIdx].storage_type(), kDefaultStorage) + << "sparse_retain operator only takes default NDArray as its index array"; + if (inputs[sr::kArr].storage_type() == kRowSparseStorage) { + NDArray output_nd = outputs[sr::kOut]; + SparseRetainOpForwardRspImpl(ctx.get_stream(), inputs[sr::kArr], + inputs[sr::kIdx].data(), req[sr::kOut], &output_nd); + } else { + LOG(FATAL) << "sparse_retain op only supports 
row-sparse ndarrays as input"; + } +} + +template +struct SparseRetainRspGradKernel { + template + MSHADOW_XINLINE static void Map(int i, DType* in_grad, RType* in_grad_idx, + const DType* out_grad, const IType* idx, + const size_t row_length) { + const RType irow = idx[i]; + in_grad_idx[i] = irow; + const size_t out_offset = irow * row_length; + const size_t in_offset = i * row_length; + for (size_t j = 0; j < row_length; ++j) { + KERNEL_ASSIGN(in_grad[in_offset+j], req, out_grad[out_offset+j]); + } + } +}; + +template +void SparseRetainOpBackwardEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(req.size(), 2U); + CHECK_EQ(req[sr::kIdx], kNullOp); + if (req[sr::kArr] == kNullOp) return; + CHECK_EQ(req[sr::kArr], kWriteTo); + + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 2U) + << "sparse_retain does not support calculating gradients of indices"; + + CHECK_EQ(inputs[sr::kOut].storage_type(), kDefaultStorage) + << "sparse_retain backward only takes default NDArray as ograd"; + CHECK_EQ(inputs[sr::kIdx].storage_type(), kDefaultStorage) + << "sparse_retain backward only takes default NDArray as its index array"; + CHECK_EQ(outputs[sr::kArr].storage_type(), kRowSparseStorage) + << "sparse_retain backward only outputs row sparse NDArray as grad of input"; + + using namespace mxnet_op; + using namespace mshadow; + Stream *s = ctx.get_stream(); + const TBlob idx_data = inputs[sr::kIdx].data(); + if (idx_data.Size() == 0U) { + NDArray output = outputs[sr::kArr]; + FillZerosRspImpl(s, &output); + return; + } + + const TBlob out_grad_data = inputs[sr::kOut].data(); + + NDArray in_grad_nd = outputs[sr::kArr]; + in_grad_nd.CheckAndAlloc({mshadow::Shape1(idx_data.Size())}); + TBlob in_grad_data = in_grad_nd.data(); + TBlob in_grad_idx = in_grad_nd.aux_data(rowsparse::kIdx); + const auto row_length = out_grad_data.shape_.ProdShape(1, out_grad_data.shape_.ndim()); + 
+ MSHADOW_TYPE_SWITCH(out_grad_data.type_flag_, DType, { // output data type + MSHADOW_IDX_TYPE_SWITCH(in_grad_idx.type_flag_, RType, { // row index data type + MSHADOW_TYPE_SWITCH(idx_data.type_flag_, IType, { // index array data type + MXNET_ASSIGN_REQ_SWITCH(req[sr::kArr], req_type, { + Kernel, xpu>::Launch( + s, in_grad_idx.Size(), in_grad_data.dptr(), in_grad_idx.dptr(), + out_grad_data.dptr(), idx_data.dptr(), row_length); + }); + }); + }); + }); +} + + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_TENSOR_SPARSE_RETAIN_INL_H_ diff --git a/src/operator/tensor/sparse_retain.cc b/src/operator/tensor/sparse_retain.cc new file mode 100644 index 000000000000..f8fc325c0534 --- /dev/null +++ b/src/operator/tensor/sparse_retain.cc @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file sparse_retain.cc + * \brief +*/ + +#include "./sparse_retain-inl.h" +namespace mxnet { +namespace op { + +// Add prefix "_sparse_" to prevent it from being registered +// under mxnet.ndarray in python frontend as this op only +// accepts row-sparse format ndarrays. It will be registered +// under mxnet.ndarray.sparse with name retain. 
+NNVM_REGISTER_OP(_sparse_retain) +.describe(R"code(pick rows specified by user input index array from a row sparse matrix +and save them in the output sparse matrix. + +Example:: + + data = [[1, 2], [3, 4], [5, 6]] + indices = [0, 1, 3] + shape = (4, 2) + rsp_in = row_sparse(data, indices) + to_retain = [0, 3] + rsp_out = retain(rsp_in, to_retain) + rsp_out.values = [[1, 2], [5, 6]] + rsp_out.indices = [0, 3] + +The storage type of ``retain`` output depends on storage types of inputs + +- retain(row_sparse, default) = row_sparse +- otherwise, ``retain`` is not supported + +)code" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "indices"}; + }) +.set_attr("FInferShape", SparseRetainOpShape) +.set_attr("FInferType", SparseRetainOpType) +.set_attr("FInferStorageType", SparseRetainForwardInferStorageType) +.set_attr("FComputeEx", SparseRetainOpForwardEx) +.set_attr("FGradient", + [](const nnvm::NodePtr& n, const std::vector& ograds) { + return MakeNonlossGradNode("_backward_sparse_retain", n, ograds, + {n->inputs[sr::kIdx]}, n->attrs.dict); + }) +.add_argument("data", "NDArray-or-Symbol", "The input array for sparse_retain operator.") +.add_argument("indices", "NDArray-or-Symbol", "The index array of rows ids that will be retained."); + +NNVM_REGISTER_OP(_backward_sparse_retain) +.set_num_inputs(2) +.set_num_outputs(2) +.set_attr("TIsBackward", true) +.set_attr("FInferStorageType", SparseRetainBackwardInferStorageType) +.set_attr("FComputeEx", SparseRetainOpBackwardEx); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/sparse_retain.cu b/src/operator/tensor/sparse_retain.cu new file mode 100644 index 000000000000..6b4ac1bdf1a1 --- /dev/null +++ b/src/operator/tensor/sparse_retain.cu @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file sparse_retain.cu + * \brief +*/ + +#include "./sparse_retain-inl.h" +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_sparse_retain) +.set_attr("FComputeEx", SparseRetainOpForwardEx); + +NNVM_REGISTER_OP(_backward_sparse_retain) +.set_attr("FComputeEx", SparseRetainOpBackwardEx); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/square_sum-inl.h b/src/operator/tensor/square_sum-inl.h new file mode 100644 index 000000000000..beb77c37b8d2 --- /dev/null +++ b/src/operator/tensor/square_sum-inl.h @@ -0,0 +1,456 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file square_sum-inl.h + * \brief This is a temporary solution for fusing operators + * square and sum together as a composite op for row sparse tensors. + * The purpose for fusing square and sum for row sparse tensors + * is that the gradient of the fused operator depends on the input + * ndarray and thus its gradient is a row-sparse ndarray too. + * This fused op will become deprecated after the functionality + * of fusing operators is finished in the future. + */ + +#ifndef MXNET_OPERATOR_TENSOR_SQUARE_SUM_INL_H_ +#define MXNET_OPERATOR_TENSOR_SQUARE_SUM_INL_H_ + +#include +#include +#include +#include "../mxnet_op.h" +#include "./broadcast_reduce_op.h" + +namespace mxnet { +namespace op { + +inline bool SquareSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + const ReduceAxesParam& param = nnvm::get(attrs.parsed); + if (in_attrs->at(0) == kRowSparseStorage) { // current impl + if (param.axis[0] == 1 && param.keepdims) { // sum per row and keep dims + STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kRowSparseStorage); + } else { + STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage); + } + } else { // fallback + type_assign(&((*in_attrs)[0]), kDefaultStorage); + type_assign(&((*out_attrs)[0]), kDefaultStorage); + } + return true; +} + +inline bool SquareSumBackwardInferStorageType(const nnvm::NodeAttrs& attrs, + const Context& ctx, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + const ReduceAxesParam& param = nnvm::get(attrs.parsed); + if (in_attrs->at(0) == kDefaultStorage || in_attrs->at(0) == kRowSparseStorage) { + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 1, kRowSparseStorage); + STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, 
kRowSparseStorage); + } else { // fallback + type_assign(&((*in_attrs)[0]), kDefaultStorage); + type_assign(&((*in_attrs)[1]), kDefaultStorage); + type_assign(&((*out_attrs)[0]), kDefaultStorage); + } + return true; +} + +/*! + * \brief square sum of a rsp + * if axis = -1, same as mx.nd.sum(tensor*tensor) + * if axis = 0, same as mx.nd.sum(tensor*tensor, axis=0) + * if axis = 1, same as mx.nd.sum(tensor*tensor, axis=1) + * where tensor*tensor is elemwise multiplication of two ndarrays. + */ +template +struct SquareSumRspKernel; + +/*! + * \brief square sum of a rsp on axis=0 without keeping the dim + */ +template +struct SquareSumRspKernel { + /*! + * \param j the element index in out_data and column id of in_data + */ + template + MSHADOW_XINLINE static void Map(int j, DType* out_data, const DType* in_data, + const int64_t nnr, const int64_t num_cols) { + DType sum = 0; + for (int64_t i = 0; i < nnr; ++i) { + const DType val = in_data[i*num_cols+j]; + sum += val * val; + } + KERNEL_ASSIGN(out_data[j], req, sum); + } +}; + +/*! + * \brief square sum of a rsp on axis=1 without keeping the dim + */ +template +struct SquareSumRspKernel { + /*! + * \param i the i-th non-zero row of in_data + */ + template + MSHADOW_XINLINE static void Map(int i, DType* out_data, const IType* in_row_idx, + const DType* in_data, const int64_t num_cols) { + DType sum = 0; + const int64_t offset = i * num_cols; + for (int64_t j = 0; j < num_cols; ++j) { + const DType val = in_data[offset+j]; + sum += val * val; + } + KERNEL_ASSIGN(out_data[in_row_idx[i]], req, sum); + } +}; + +/*! + * \brief square sum of a rsp on axis=1 keeping the dim + */ +template +struct SquareSumRspKernel { + /*! 
+ * \param i the i-th non-zero row of in_data + */ + template + MSHADOW_XINLINE static void Map(int i, IType* out_row_idx, DType* out_data, + const IType* in_row_idx, const DType* in_data, + const int64_t num_cols) { + DType sum = 0; + out_row_idx[i] = in_row_idx[i]; + const int64_t offset = i * num_cols; + for (int64_t j = 0; j < num_cols; ++j) { + const DType val = in_data[offset+j]; + sum += val * val; + } + KERNEL_ASSIGN(out_data[i], req, sum); + } +}; + +template +struct SquareSumRspGradKernel; + +template +struct SquareSumRspGradKernel { + /*! + * \param i element index in in_grad and in_data + * \param in_grad_row_idx row_idx of the gradient of the op's input + * \param in_grad gradient of the op's input + * \param out_grad gradient of the op's output + * \param in_row_idx row idx of the op's input + * \param in_data op's input + */ + template + MSHADOW_XINLINE static void Map(int i, IType* in_grad_row_idx, DType* in_grad, + const DType* out_grad, const IType* in_row_idx, + const DType* in_data, const int64_t num_cols) { + const int64_t row = i / num_cols; + in_grad_row_idx[row] = in_row_idx[row]; + KERNEL_ASSIGN(in_grad[i], req, 2*in_data[i]*out_grad[i%num_cols]); + } +}; + +template +struct SquareSumRspGradKernel { + /*! + * \param i element index in in_grad and in_data + * \param in_grad_row_idx row_idx of the gradient of the op's input + * \param in_grad gradient of the op's input + * \param out_grad gradient of the op's output + * \param in_row_idx row idx of the op's input + * \param in_data op's input + */ + template + MSHADOW_XINLINE static void Map(int i, IType* in_grad_row_idx, DType* in_grad, + const DType* out_grad, const IType* in_row_idx, + const DType* in_data, const int64_t num_cols) { + const int64_t row = i / num_cols; + in_grad_row_idx[row] = in_row_idx[row]; + KERNEL_ASSIGN(in_grad[i], req, 2*in_data[i]*out_grad[in_row_idx[row]]); + } +}; + +/*! 
+ * Note: This kernel assumes that the ograd and in_data + * are all rsp and have equal row_idx array, or + * in_data is a full rsp. + */ +template +struct SquareSumRspGradKernel { + /*! + * \param i index of igrad.data() + * \param in_grad_row_idx row_idx of the gradient of the op's input + * \param in_grad gradient of the op's input + * \param out_grad_row_idx row_idx of the gradient of the op's output + * \param out_grad gradient of the op's output + * \param in_data op's input + */ + template + MSHADOW_XINLINE static void Map(int i, IType* in_grad_row_idx, DType* in_grad, + const IType* out_grad_row_idx, const DType* out_grad, + const DType* in_data, const int64_t num_cols) { + const int64_t row = i / num_cols; + in_grad_row_idx[row] = out_grad_row_idx[row]; + KERNEL_ASSIGN(in_grad[i], req, 2*in_data[i]*out_grad[row]); + } +}; + +template +void SquareSumRspImpl(const nnvm::NodeAttrs& attrs, + mshadow::Stream* s, + const NDArray& input, + const OpReqType req, + NDArray* output) { + if (req == kNullOp) return; + const ReduceAxesParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(param.axis.ndim(), 1U) << "_square_sum(row_sparse_matrix) only supports axis=0 or 1"; + CHECK(param.axis[0] == 0 || param.axis[0] == 1) + << "_square_sum(row_sparse_matrix) only supports axis=0 or 1"; + CHECK_EQ(input.storage_type(), kRowSparseStorage) + << "_square_sum op only supports row-sparse matrix as input"; + int64_t out_data_size = 0; + if (param.axis[0] == 0) { // axis = 0 + CHECK_EQ(output->storage_type(), kDefaultStorage); + out_data_size = input.storage_shape()[1]; + } else if (param.keepdims) { // axis = 1, keepdims = true + CHECK_EQ(output->storage_type(), kRowSparseStorage); + out_data_size = input.storage_shape()[0]; + } else { // axis = 1, keepdims = false + CHECK_EQ(output->storage_type(), kDefaultStorage); + out_data_size = input.shape()[0]; + } + CHECK_NE(req, kWriteInplace); + + using namespace mxnet_op; + if (!input.storage_initialized()) { + if (req == kWriteTo) { + 
if (output->storage_type() == kDefaultStorage) { + MSHADOW_TYPE_SWITCH(output->data().type_flag_, DType, { + Kernel::Launch(s, out_data_size, output->data().dptr()); + }) + } else if (output->storage_type() == kRowSparseStorage) { + FillZerosRspImpl(s, output); + } else { + LOG(FATAL) << "SquareSumRspImpl only supports row-sparse/dense output storage type"; + } + } + return; + } + + if (output->storage_type() == kRowSparseStorage) { + output->CheckAndAlloc({input.aux_shape(rowsparse::kIdx)}); + } + const TBlob& out_data = output->data(); + const int64_t nnr = input.storage_shape()[0]; + const int64_t num_cols = input.storage_shape()[1]; + const TBlob& in_data = input.data(); + if (0 == param.axis[0]) { // axis = 0, output is dense + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + Kernel, xpu>::Launch(s, num_cols, + out_data.dptr(), input.data().dptr(), nnr, num_cols); + }) + }) + } else { // axis = 1 + const TBlob in_row_idx = input.aux_data(rowsparse::kIdx); + if (param.keepdims) { // output is rsp + const TBlob out_row_idx = output->aux_data(rowsparse::kIdx); + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(in_row_idx.type_flag_, IType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + Kernel, xpu>::Launch(s, nnr, + out_row_idx.dptr(), out_data.dptr(), in_row_idx.dptr(), + in_data.dptr(), num_cols); + }) + }) + }) + } else { // output is dense + if (req == kWriteTo) { + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + Kernel::Launch(s, out_data_size, out_data.dptr()); + }) + } + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(in_row_idx.type_flag_, IType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + Kernel, xpu>::Launch(s, nnr, + out_data.dptr(), in_row_idx.dptr(), in_data.dptr(), num_cols); + }) + }) + }) + } + } +} + +template +void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs, + mshadow::Stream* s, + const NDArray& ograd, + const NDArray& input, + 
const OpReqType req, + NDArray* igrad) { + if (req == kNullOp) return; + const ReduceAxesParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(param.axis.ndim(), 1U) << "_square_sum(row_sparse_matrix) only supports axis=0/1"; + CHECK(param.axis[0] == 0 || param.axis[0] == 1) + << "_square_sum(row_sparse_matrix) only supports axis=0 or 1"; + CHECK(ograd.storage_type() == kDefaultStorage || ograd.storage_type() == kRowSparseStorage); + CHECK_EQ(input.storage_type(), kRowSparseStorage); + CHECK_EQ(igrad->storage_type(), kRowSparseStorage); + CHECK_EQ(req, kWriteTo); + if (!input.storage_initialized()) { + FillZerosRspImpl(s, igrad); + return; + } + + using namespace mxnet_op; + // TODO(junwu) change the input of CheckAndAlloc + // if we want to support differen row idx arrays + // for ograd and input when they are both row-sparse ndarrays + igrad->CheckAndAlloc({input.aux_shape(rowsparse::kIdx)}); + const int64_t num_cols = input.storage_shape()[1]; + const TBlob& igrad_data = igrad->data(); + const TBlob igrad_row_idx = igrad->aux_data(rowsparse::kIdx); + const TBlob& ograd_data = ograd.data(); + const TBlob& in_data = input.data(); + const TBlob in_row_idx = input.aux_data(rowsparse::kIdx); + if (ograd.storage_type() == kDefaultStorage) { + if (0 == param.axis[0]) { // forward is sum per column + MSHADOW_TYPE_SWITCH(igrad_data.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(igrad_row_idx.type_flag_, IType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + Kernel, xpu>::Launch( + s, igrad_data.Size(), igrad_row_idx.dptr(), + igrad_data.dptr(), ograd_data.dptr(), + in_row_idx.dptr(), in_data.dptr(), num_cols); + }) + }) + }) + } else { // forward is sum per row + MSHADOW_TYPE_SWITCH(igrad_data.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(igrad_row_idx.type_flag_, IType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + Kernel, xpu>::Launch( + s, igrad_data.Size(), igrad_row_idx.dptr(), + igrad_data.dptr(), ograd_data.dptr(), + in_row_idx.dptr(), in_data.dptr(), num_cols); + 
}) + }) + }) + } + } else if (ograd.storage_type() == kRowSparseStorage) { + CHECK_EQ(1, param.axis[0]) << "SquareSumRspGradImpl only supports axis = 1" + " when ograd_stype = kRowSparseStorage"; + CHECK_EQ(ograd.shape().ndim(), 2U); + const TBlob ograd_row_idx = ograd.aux_data(rowsparse::kIdx); + CHECK(ograd_row_idx.Size() == in_row_idx.Size() || in_row_idx.Size() == in_data.shape_[0]); + MSHADOW_IDX_TYPE_SWITCH(igrad_row_idx.type_flag_, IType, { + if (std::is_same::value) { + const IType* first1 = ograd_row_idx.dptr(); + const IType* last1 = first1 + ograd_row_idx.Size(); + const IType* first2 = in_row_idx.dptr(); + // when ograd_row_idx and in_row_idx have the same size and input is not a full rsp + // ograd_row_idx and in_row_idx are expected to have the same elements + if (ograd_row_idx.Size() == in_row_idx.Size() && in_row_idx.Size() != in_data.shape_[0]) { + CHECK(std::equal(first1, last1, first2)) << "SquareSumRspGradImpl only supports" + " equal ograd_row_idx and input_row_idx" + " when ograd and input are both" + " row-sparse"; + } + } else { + LOG(FATAL) << "SquareSumRspGradImpl has not implemented GPU version when" + " ograd and input are both row-sparse"; + } + MSHADOW_TYPE_SWITCH(igrad_data.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + Kernel, xpu>::Launch( + s, igrad_data.Size(), igrad_row_idx.dptr(), + igrad_data.dptr(), ograd_row_idx.dptr(), + ograd_data.dptr(), in_data.dptr(), num_cols); + }) + }) + }) + } else { + LOG(FATAL) << "SquareSumRspGradImpl only supports ograd_stype" + << " = kDefaultStorage/kRowSparseStorage"; + } +} + +template +void SquareSumOpForwardEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + mshadow::Stream* s = ctx.get_stream(); + const NDArrayStorageType istype = inputs[0].storage_type(); + if (istype == 
kRowSparseStorage) { + CHECK_EQ(inputs[0].shape().ndim(), 2U) << "_square_sum op only supports" + " 2D ndarray as input"; + NDArray output = outputs[0]; + SquareSumRspImpl(attrs, s, inputs[0], req[0], &output); + } else { + LOG(FATAL) << "_square_sum op only supports row-sparse ndarray" + " as input, while input stype = " + << istype; + } +} + +template +void SquareSumOpBackwardEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + mshadow::Stream* s = ctx.get_stream(); + const NDArrayStorageType ograd_stype = inputs[0].storage_type(); + const NDArrayStorageType input_stype = inputs[1].storage_type(); + if (input_stype == kRowSparseStorage + && (ograd_stype == kDefaultStorage || ograd_stype == kRowSparseStorage)) { + CHECK_EQ(inputs[1].shape().ndim(), 2U) << "_square_sum op only supports" + " 2D ndarray as input"; + NDArray output = outputs[0]; + SquareSumRspGradImpl(attrs, s, inputs[0], inputs[1], req[0], &output); + } else { + LOG(FATAL) << "_square_sum op backward only supports dense ndarray as ograd," + " row-sparse ndarray as input and row-sparse ndarray as igrad," + " while ograd_stype = " << ograd_stype + << " input_stype = " << input_stype; + } +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_TENSOR_SQUARE_SUM_INL_H_ diff --git a/src/operator/tensor/square_sum.cc b/src/operator/tensor/square_sum.cc new file mode 100644 index 000000000000..e4b49d7f7fcb --- /dev/null +++ b/src/operator/tensor/square_sum.cc @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file square_sum.cc + * \brief CPU Implementation of square_sum op. + */ +#include "./square_sum-inl.h" + +namespace mxnet { +namespace op { +MXNET_OPERATOR_REGISTER_REDUCE(_square_sum) +.describe(R"code(Computes the square sum of array elements over a given axis +for row-sparse matrix. This is a temporary solution for fusing ops square and +sum together for row-sparse matrix to save memory for storing gradients. +It will become deprecated once the functionality of fusing operators is finished +in the future. 
+ +Example:: + + dns = mx.nd.array([[0, 0], [1, 2], [0, 0], [3, 4], [0, 0]]) + rsp = dns.tostype('row_sparse') + sum = mx.nd._internal._square_sum(rsp, axis=1) + sum = [0, 5, 0, 25, 0] +)code" ADD_FILELINE) +.set_attr("FInferStorageType", SquareSumForwardInferStorageType) +.set_attr("FComputeEx", SquareSumOpForwardEx) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_square_sum"}); + +MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_square_sum) +.set_num_inputs(2) +.set_attr("FInferStorageType", SquareSumBackwardInferStorageType) +.set_attr("FComputeEx", SquareSumOpBackwardEx); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/util/tensor_util-inl.cuh b/src/operator/tensor/util/tensor_util-inl.cuh new file mode 100644 index 000000000000..cf268e7ae9fc --- /dev/null +++ b/src/operator/tensor/util/tensor_util-inl.cuh @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2017 by Contributors + * \file tensor_util-inl.cuh + * \brief commonly utilized tensor operator GPU kernels + */ +#ifndef MXNET_OPERATOR_TENSOR_UTIL_TENSOR_UTIL_INL_CUH_ +#define MXNET_OPERATOR_TENSOR_UTIL_TENSOR_UTIL_INL_CUH_ + +#include +#include +#include + +namespace mxnet { +namespace op { + +/*! + * \brief Thread kernel for marking non-zero rows of a tensor. + * Parallelized by tensor rows: 1 thread/row + */ +struct MarkRspRowThreadKernel { + /*! + * \brief + * \param tid global thread id + * \param row_flg row flag array to mark non-zero rows + * \param dns dense matrix data + * \param num_rows number of rows (size of first dimension of tensor) + * \param row_length number of elements per row + */ + template + __device__ __forceinline__ static void Map(int tid, + nnvm::dim_t* row_flg, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t row_length) { + using nnvm::dim_t; + if (tid < num_rows) { + dim_t j = 0; + dim_t offset = tid * row_length; + for (; j < row_length; ++j) { + if (dns[offset+j] != 0) { + break; + } + } + if (j < row_length) { + row_flg[tid] = 1; // mark as one for non-zero row + } else { + row_flg[tid] = 0; // mark as zero for zero row + } + } + } +}; + +/*! + * \brief Warp kernel for marking non-zero rows of a tensor. 
+ * Parallelized by tensor rows: 1 warp/row + */ +struct MarkRspRowWarpKernel { + template + __device__ __forceinline__ static void Map(int tid, + nnvm::dim_t* row_flg, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t row_length) { + using nnvm::dim_t; + typedef cub::WarpReduce WarpReduce; + const dim_t warps_per_block = mshadow::cuda::kBaseThreadNum / 32; + __shared__ typename WarpReduce::TempStorage temp_storage[warps_per_block]; + + const dim_t warp_id = tid / 32; // global warp id + const dim_t warp_lane = threadIdx.x / 32; // local warp id within thread block + const dim_t lane = tid & (32-1); // local thread id within warp + + if (warp_id < num_rows) { + dim_t flg = 0; + dim_t offset = warp_id * row_length; + for (dim_t j = lane; j < row_length; j+=32) { + if (dns[offset+j] != 0) { + // avoid break: causes slower performance on sparse tensors (<20% density), + // due to thread divergence + flg++; + } + } + dim_t aggr = WarpReduce(temp_storage[warp_lane]).Sum(flg); + if (lane == 0) { + if (aggr > 0) { + row_flg[warp_id] = 1; // mark as one for non-zero row + } else { + row_flg[warp_id] = 0; // mark as zero for zero row + } + } + } + } +}; + +/*! + * \brief Block kernel for marking non-zero rows of a tensor. 
+ * Parallelized by tensor rows: 1 threadBlock/row + */ +struct MarkRspRowBlockKernel { + template + __device__ __forceinline__ static void Map(int tid, + nnvm::dim_t* row_flg, + const DType* dns, + const nnvm::dim_t num_rows, + const nnvm::dim_t row_length) { + using nnvm::dim_t; + using mshadow::cuda::kBaseThreadNum; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + if (blockIdx.x < num_rows) { + dim_t flg = 0; + dim_t offset = blockIdx.x * row_length; + for (dim_t j = threadIdx.x; j < row_length; j+=kBaseThreadNum) { + if (dns[offset+j] != 0) { + // avoid break: causes slower performance on sparse tensors (<20% density), + // due to thread divergence + flg++; + } + } + dim_t aggr = BlockReduce(temp_storage).Sum(flg); + if (threadIdx.x == 0) { + if (aggr > 0) { + row_flg[blockIdx.x] = 1; // mark as one for non-zero row + } else { + row_flg[blockIdx.x] = 0; // mark as zero for zero row + } + } + } + } +}; + +/*! + * \brief GPU kernel to flag non-zero rows of an rsp tensor with indices. + * Parallelized by matrix rows: 1 thread/row + */ +struct SetRspRowFlgKernel { + /*! + * \brief + * \param tid global thread id + * \param row_flg array to flag storage indices of non-zero rows + * \param row_idx rsp matrix row index array storing indices of non-zero rows + * \param nnr rsp matrix number of non-zero rows (storage shape) + */ + template + __device__ __forceinline__ static void Map(int tid, + RType* row_flg, + const RType* row_idx, + const nnvm::dim_t nnr) { + if (tid < nnr) { + row_flg[row_idx[tid]] = tid+1; + } + } +}; + +/*! + * \brief GPU kernel for filling the row index array of an rsp tensor. + * Parallelized by tensor rows: 1 thread/row + */ +struct FillRspRowIdxKernel { + /*! 
+ * \brief + * \param tid global thread id + * \param row_idx row index array to store indices of non-zero rows + * \param row_flg_sum inclusive prefix sum array over 0/1 marked row flag array + * \param num_rows rsp tensor number of rows (shape) + */ + template + __device__ __forceinline__ static void Map(int tid, + RType* row_idx, + const nnvm::dim_t* row_flg_sum, + const nnvm::dim_t num_rows) { + if (tid < num_rows) { + nnvm::dim_t prev = (tid == 0)? 0 : row_flg_sum[tid-1]; + if (row_flg_sum[tid] > prev) { + row_idx[prev] = static_cast(tid); + } + } + } +}; + +/*! + * \brief GPU kernel for marking non-zero columns of a csr matrix. + * Parallelized by matrix rows: 1 warp/row + */ +struct MarkCsrColWarpKernel { + /*! + * \brief + * \param tid global thread id + * \param flg flg array to mark non-zero columns + * \param col_idx csr matrix column indices + * \param indptr csr matrix row index pointer + * \param num_rows csr matrix number of rows + * \param num_cols csr matrix number of columns + */ + template + __device__ __forceinline__ static void Map(int tid, + nnvm::dim_t* flg, + const CType* col_idx, + const IType* indptr, + const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + typedef unsigned long long int uint64_cu; + static_assert(sizeof(uint64_cu) == sizeof(nnvm::dim_t), "unexpected sizeof dim_t"); + + const nnvm::dim_t warp_id = tid / 32; // global warp id + const nnvm::dim_t lane = tid & (32-1); // local thread id within warp + + if (warp_id < num_rows) { + uint64_cu zero = 0; + uint64_cu one = 1; + for (IType j = indptr[warp_id]+lane; j < indptr[warp_id+1]; j+=32) { + atomicCAS(reinterpret_cast(flg+col_idx[j]), zero, one); + } + } + } +}; + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_TENSOR_UTIL_TENSOR_UTIL_INL_CUH_ diff --git a/tests/ci_build/install/ubuntu_install_python.sh b/tests/ci_build/install/ubuntu_install_python.sh index bb67e3401a89..db4e9c4e0c94 100755 --- a/tests/ci_build/install/ubuntu_install_python.sh +++ 
b/tests/ci_build/install/ubuntu_install_python.sh @@ -24,5 +24,5 @@ apt-get update && apt-get install -y python-dev python3-dev # the version of the pip shipped with ubuntu may be too lower, install a recent version here cd /tmp && wget https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py && python2 get-pip.py -pip2 install nose pylint numpy nose-timer requests h5py -pip3 install nose pylint numpy nose-timer requests h5py +pip2 install nose pylint numpy nose-timer requests h5py scipy +pip3 install nose pylint numpy nose-timer requests h5py scipy diff --git a/tests/cpp/operator/batchnorm_test.cc b/tests/cpp/operator/batchnorm_test.cc index 3fef28f79a0a..cd202ace1686 100644 --- a/tests/cpp/operator/batchnorm_test.cc +++ b/tests/cpp/operator/batchnorm_test.cc @@ -19,7 +19,7 @@ /*! * \file batchnorm_test.cc - * \brief operator unit test utility functions + * \brief batchnorm operator unit test utility functions * \author Chris Olivier */ @@ -892,8 +892,8 @@ TEST(BATCH_NORM, TestIterAll) { kwargs.push_back({ "cudnn_off", "True" }); } for (TShape shape : shapes) { - for (int g1 = 0; g1 < 2U; ++g1) { - for (int g2 = 0; g2 < 2U; ++g2) { + for (int g1 = 0; g1 < 2; ++g1) { + for (int g2 = 0; g2 < 2; ++g2) { for (int type : v2_types) { MSHADOW_REAL_TYPE_SWITCH_EX( type, DType, AccReal, diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 3fbf9f910879..af1ecfc5036f 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -22,45 +22,155 @@ sys.path.insert(0, "../../python/") import mxnet as mx import numpy as np +import numpy.random as rnd import time -def check_diff_to_scalar(A, x): +def check_diff_to_scalar(A, x, rank=None): """ assert A == x""" - assert(np.sum(np.abs((A - x).asnumpy())) == 0), A.asnumpy() + assert(np.sum(np.abs((A - x).asnumpy())) == 0), (rank, A.asnumpy(), x) # setup -keys = [3, 5, 7] +keys = ['3', '5', '7'] +rsp_keys = ['9', '11', '13'] + rate = 2 -shape = (2, 2) -big_shape = 
(1200, 1200) # big than BIGARRAY_BOUND +shape = (2, 3) +big_shape = (1200, 1200) # bigger than BIGARRAY_BOUND -kv = mx.kv.create('dist_sync') +def init_kv(): + kv = mx.kv.create('dist_sync') + # init kv dns keys + kv.init(keys, [mx.nd.ones(shape)] * len(keys)) + kv.init('99', mx.nd.ones(big_shape)) + # init kv row_sparse keys + kv.init(rsp_keys, [mx.nd.ones(shape).tostype('row_sparse')] * len(rsp_keys)) + kv.init('100', mx.nd.ones(big_shape).tostype('row_sparse')) + # worker info + my_rank = kv.rank + nworker = kv.num_workers + # init updater on servers + kv.set_optimizer(mx.optimizer.create('test', rescale_grad=rate)) + return kv, my_rank, nworker -# init kv -kv.init(keys, [mx.nd.ones(shape)] * len(keys)) -kv.init(99, mx.nd.ones(big_shape)) -# init updater on servers -kv.set_optimizer(mx.optimizer.create('test', rate)) +def test_sync_push_pull(): + kv, my_rank, nworker = init_kv() + def check_default_keys(kv, my_rank, nworker): + nrepeat = 3 + for i in range(nrepeat): + kv.push('3', mx.nd.ones(shape)*(my_rank+1)) + kv.push('99', mx.nd.ones(big_shape)*(my_rank+1)) -my_rank = kv.rank -nworker = kv.num_workers + num = (nworker + 1) * nworker * rate / 2 * nrepeat + 1 + val = mx.nd.zeros(shape) + kv.pull('3', out=val) + check_diff_to_scalar(val, num) -def test_sync_push_pull(): - nrepeat = 3 - for i in range(nrepeat): - kv.push(3, mx.nd.ones(shape)*(my_rank+1)) - kv.push(99, mx.nd.ones(big_shape)*(my_rank+1)) - - num = (nworker + 1 ) * nworker * rate / 2 * nrepeat + 1 - val = mx.nd.zeros(shape) - kv.pull(3, out = val) - check_diff_to_scalar(val, num) - # print val.asnumpy() - - val2 = mx.nd.zeros(big_shape) - kv.pull(99, out = val2) - check_diff_to_scalar(val2, num) + val2 = mx.nd.zeros(big_shape) + kv.pull('99', out=val2) + check_diff_to_scalar(val2, num) + + def check_row_sparse_keys(kv, my_rank, nworker): + nrepeat = 3 + # prepare gradient + v = mx.nd.zeros(shape) + my_row = my_rank % shape[0] + v[my_row] = my_rank + 1 + # push + for i in range(nrepeat): + 
kv.push('9', v.tostype('row_sparse')) + # select a random subset of rows this worker is interested in + num_rows = shape[0] + row_ids_np = np.random.randint(num_rows, size=num_rows) + row_ids = mx.nd.array(row_ids_np, dtype='int64') + # perform pull + val = mx.nd.zeros(shape, stype='row_sparse') + kv.row_sparse_pull('9', out=val, row_ids=row_ids) + # prepare updated values + updated_val = mx.nd.ones(shape) + for rank in range(nworker): + row = rank % shape[0] + updated_val[row] += (rank + 1) * rate * nrepeat + # verify subset of updated values + expected = mx.nd.zeros(shape) + for row in row_ids_np: + expected[row] = updated_val[row] + check_diff_to_scalar(val, expected) + + def check_row_sparse_keys_with_zeros(kv, my_rank, nworker): + nrepeat = 3 + # prepare gradient + v = mx.nd.zeros(shape) + big_v = mx.nd.zeros(big_shape) + # push + for i in range(nrepeat): + kv.push('11', v.tostype('row_sparse')) + kv.push('100', big_v.tostype('row_sparse')) + + # pull a subset of rows this worker is interested in + all_row_ids = np.arange(shape[0]) + val = mx.nd.ones(shape).tostype('row_sparse') + big_val = mx.nd.ones(big_shape).tostype('row_sparse') + kv.row_sparse_pull('11', out=val, row_ids=mx.nd.array(all_row_ids, dtype='int64')) + big_num_rows = shape[0] + big_all_row_ids = np.arange(big_shape[0]) + kv.row_sparse_pull('100', out=big_val, row_ids=mx.nd.array(big_all_row_ids, dtype='int64')) + # verify results + check_diff_to_scalar(val, mx.nd.ones(shape)) + check_diff_to_scalar(big_val, mx.nd.ones(big_shape)) + + def check_big_row_sparse_keys(kv, my_rank, nworker): + mx.random.seed(123) + rnd.seed(123) + density = 0.3 + nrepeat = 3 + # prepare gradient + v = mx.nd.zeros(big_shape) + idx_sample = rnd.rand(big_shape[0]) + indices = np.argwhere(idx_sample < density).flatten() + # each worker chooses a subset of the indices to update + update_rows = [] + for rank in range(nworker): + rows = [] + i = 0 + step = (rank + 1) * 2 + while i < len(indices): + rows.append(indices[i]) 
+ i += step + update_rows.append(np.array(rows)) + # rows to update for this worker + for row in update_rows[my_rank]: + v[row] = my_rank + 1 + # push + for i in range(nrepeat): + kv.push('100', v.tostype('row_sparse')) + + # select a random subset of rows this worker is interested in + mx.random.seed(my_rank) + rnd.seed(my_rank) + num_rows = big_shape[0] + row_ids_np = np.random.randint(num_rows, size=num_rows) + row_ids = mx.nd.array(row_ids_np, dtype='int64') + # perform pull + val = mx.nd.zeros(big_shape, stype='row_sparse') + kv.row_sparse_pull('100', out=val, row_ids=row_ids) + # prepare expected result + updated_val = mx.nd.ones(big_shape) + # apply updates from each worker + for rank in range(nworker): + for row in update_rows[rank]: + updated_val[row] += (rank + 1) * rate * nrepeat + + expected = mx.nd.zeros(big_shape) + for row in row_ids_np: + expected[row] = updated_val[row] + check_diff_to_scalar(val, expected, rank=my_rank) + + check_default_keys(kv, my_rank, nworker) + check_row_sparse_keys(kv, my_rank, nworker) + check_row_sparse_keys_with_zeros(kv, my_rank, nworker) + check_big_row_sparse_keys(kv, my_rank, nworker) + print('worker ' + str(my_rank) + ' is done') if __name__ == "__main__": test_sync_push_pull() diff --git a/tests/python/gpu/test_kvstore_gpu.py b/tests/python/gpu/test_kvstore_gpu.py new file mode 100644 index 000000000000..ffc0cc1f93e0 --- /dev/null +++ b/tests/python/gpu/test_kvstore_gpu.py @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: skip-file +import mxnet as mx +import numpy as np +from mxnet.test_utils import assert_almost_equal, default_context + +shape = (4, 4) +keys = [5, 7, 11] +str_keys = ['b', 'c', 'd'] + + +def init_kv_with_str(stype='default'): + """init kv """ + kv = mx.kv.create() + # single + kv.init('a', mx.nd.zeros(shape, stype=stype)) + # list + kv.init(str_keys, [mx.nd.zeros(shape=shape, stype=stype)] * len(keys)) + return kv + + +def test_row_sparse_pull(): + kv = init_kv_with_str('row_sparse') + kv.init('e', mx.nd.ones(shape).tostype('row_sparse')) + + def check_row_sparse_pull(kv, count, ctx=default_context()): + num_rows = shape[0] + vals = [] + row_ids = [] + all_row_ids = np.arange(num_rows) + for i in range(count): + vals.append(mx.nd.zeros(shape, ctx=ctx).tostype('row_sparse')) + row_id = np.random.randint(num_rows, size=num_rows) + row_ids.append(mx.nd.array(row_id, dtype='int64')) + row_ids_to_pull = row_ids[0] if len(row_ids) == 1 else row_ids + vals_to_pull = vals[0] if len(vals) == 1 else vals + + kv.row_sparse_pull('e', out=vals_to_pull, row_ids=row_ids_to_pull) + for val, row_id in zip(vals, row_ids): + retained = val.asnumpy() + excluded_row_ids = np.setdiff1d(all_row_ids, row_id.asnumpy()) + for row in range(num_rows): + expected_val = np.zeros_like(retained[row]) + expected_val += 0 if row in excluded_row_ids else 1 + assert_almost_equal(retained[row], expected_val) + + check_row_sparse_pull(kv, 1, mx.gpu(0)) + check_row_sparse_pull(kv, 4, mx.gpu(0)) + + +if __name__ == '__main__': + test_row_sparse_pull() diff --git 
a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 81492fe6bbdb..35a20f935573 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -31,6 +31,9 @@ from test_gluon import * #from test_rnn import * from test_gluon_rnn import * +from test_sparse_operator import test_cast_storage_ex, test_sparse_dot +from test_sparse_operator import test_sparse_nd_zeros, test_sparse_retain +from test_sparse_ndarray import test_create_csr, test_create_row_sparse set_default_context(mx.gpu(0)) del test_support_vector_machine_l1_svm diff --git a/tests/python/unittest/test_autograd.py b/tests/python/unittest/test_autograd.py index 30dd662ff1cc..37bb5626f765 100644 --- a/tests/python/unittest/test_autograd.py +++ b/tests/python/unittest/test_autograd.py @@ -106,29 +106,41 @@ def autograd_assert(*args, **kwargs): assert same(a.asnumpy(), b.asnumpy()) def test_unary_func(): - x = nd.uniform(shape=(4, 5)) - f_exp = lambda x: nd.exp(x) - f_exp_grad = lambda x: [nd.exp(x)] - autograd_assert(x, func=f_exp, grad_func=f_exp_grad) - f_half = lambda x: x/2 - f_half_grad = lambda x: [nd.ones(x.shape) * 0.5] - autograd_assert(x, func=f_half, grad_func=f_half_grad) - f_square = lambda x: x**2 - f_square_grad = lambda x: [2*x] - autograd_assert(x, func=f_square, grad_func=f_square_grad) + def check_unary_func(x): + f_exp = lambda x: nd.exp(x) + f_exp_grad = lambda x: [nd.exp(x)] + autograd_assert(x, func=f_exp, grad_func=f_exp_grad) + f_half = lambda x: x/2 + f_half_grad = lambda x: [nd.ones(x.shape) * 0.5] + autograd_assert(x, func=f_half, grad_func=f_half_grad) + f_square = lambda x: x**2 + f_square_grad = lambda x: [2*x] + autograd_assert(x, func=f_square, grad_func=f_square_grad) + uniform = nd.uniform(shape=(4, 5)) + stypes = ['row_sparse', 'csr', 'default'] + for stype in stypes: + check_unary_func(uniform.tostype(stype)) def test_binary_func(): - x = nd.uniform(shape=(4, 5)) - y = nd.uniform(shape=(4, 5)) - f_add = 
lambda x, y: x+y - f_add_grad = lambda x, y: [nd.ones(x.shape), nd.ones(y.shape)] - autograd_assert(x, y, func=f_add, grad_func=f_add_grad) - f_mul = lambda x, y: x*y - f_mul_grad = lambda x, y: [y, x] - autograd_assert(x, y, func=f_mul, grad_func=f_mul_grad) - f_compose = lambda x, y: x+x*y - f_compose_grad = lambda x, y: [nd.ones(x.shape) + y, x] - autograd_assert(x, y, func=f_compose, grad_func=f_compose_grad) + def check_binary_func(x, y): + f_add = lambda x, y: x+y + f_add_grad = lambda x, y: [nd.ones(x.shape), nd.ones(y.shape)] + autograd_assert(x, y, func=f_add, grad_func=f_add_grad) + f_mul = lambda x, y: x*y + f_mul_grad = lambda x, y: [y, x] + autograd_assert(x, y, func=f_mul, grad_func=f_mul_grad) + f_compose = lambda x, y: x+x*y + f_compose_grad = lambda x, y: [nd.ones(x.shape) + y, x] + autograd_assert(x, y, func=f_compose, grad_func=f_compose_grad) + uniform_x = nd.uniform(shape=(4, 5)) + uniform_y = nd.uniform(shape=(4, 5)) + stypes = ['row_sparse', 'csr', 'default'] + for stype_x in stypes: + for stype_y in stypes: + x = uniform_x.tostype(stype_x) + y = uniform_y.tostype(stype_y) + check_binary_func(x, y) + def test_operator_with_state(): def f_fc(a, b, weight, bias): @@ -255,14 +267,19 @@ def test_retain_grad(): def test_attach_grad(): - x = mx.nd.zeros((10,)) - assert x.grad is None - x.attach_grad() - with record(): - y = x * 2 - assert y.grad is None - y.backward() - assert (x.grad.asnumpy() == 2).all() + def check_attach_grad(x): + assert x.grad is None + x.attach_grad() + with record(): + y = x * 2 + assert y.grad is None + y.backward() + assert (x.grad.asnumpy() == 2).all() + zeros = mx.nd.zeros((10, 10)) + stypes = ['default', 'row_sparse', 'csr'] + for stype in stypes: + x = zeros.tostype(stype) + check_attach_grad(x) def test_is_train(): diff --git a/tests/python/unittest/test_infer_shape.py b/tests/python/unittest/test_infer_shape.py index d7f52e216659..73654a604135 100644 --- a/tests/python/unittest/test_infer_shape.py +++ 
b/tests/python/unittest/test_infer_shape.py @@ -52,7 +52,7 @@ def test_backward_infer(): # broadcast add here, not being able to deduce shape correctly wt = mx.sym.broadcast_add(w, wshift) # shape constraint, this is what enables backward shape inference - wt = mx._symbol_internal._identity_with_attr_like_rhs(wt, w) + wt = mx.symbol._internal._identity_with_attr_like_rhs(wt, w) net = mx.sym.FullyConnected(data=data, weight=wt, num_hidden=11, no_bias=True) data_shape = (7, 100) arg_shapes, out_shapes, aux_shapes = net.infer_shape(data=data_shape) @@ -129,6 +129,24 @@ def test_incomplete_infer_concat(): assert arg_shapes['b'] == (2, 5) assert arg_shapes['d'] == (2, 15) +def test_fc_infer_type(): + mx_real_t = mx.base.mx_real_t + data = mx.symbol.Variable('data') + out = mx.symbol.FullyConnected(data=data, name='fc1', num_hidden=1000) + + # infer type + data_type = mx_real_t + arg_types, out_types, aux_types = out.infer_type(data=data_type) + arg_type_dict = dict(zip(out.list_arguments(), arg_types)) + assert len(out_types) == 1 + assert out_types[0] == mx_real_t + true_types = { + 'fc1_bias' : mx_real_t, + 'fc1_weight' : mx_real_t } + for k, v in true_types.items(): + assert arg_type_dict[k] == v + + if __name__ == "__main__": test_mlp2_infer_shape() test_mlp2_infer_error() diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py index c0f2acd4ed47..a543463f3663 100644 --- a/tests/python/unittest/test_io.py +++ b/tests/python/unittest/test_io.py @@ -17,6 +17,7 @@ # pylint: skip-file import mxnet as mx +from mxnet.test_utils import * import numpy as np import os, gzip import pickle as pickle @@ -152,6 +153,109 @@ def test_NDArrayIter_h5py(): else: assert(labelcount[i] == 100) +def test_NDArrayIter_csr(): + import scipy.sparse as sp + # creating toy data + num_rows = rnd.randint(5, 15) + num_cols = rnd.randint(1, 20) + batch_size = rnd.randint(1, num_rows) + shape = (num_rows, num_cols) + csr, _ = rand_sparse_ndarray(shape, 'csr') + dns = 
csr.asnumpy() + + # make iterators + csr_iter = iter(mx.io.NDArrayIter(csr, csr, batch_size)) + begin = 0 + for batch in csr_iter: + expected = np.zeros((batch_size, num_cols)) + end = begin + batch_size + expected[:num_rows - begin] = dns[begin:end] + if end > num_rows: + expected[num_rows - begin:] = dns[0:end - num_rows] + assert_almost_equal(batch.data[0].asnumpy(), expected) + begin += batch_size + +def test_LibSVMIter(): + def get_data(data_dir, data_name, url, data_origin_name): + if not os.path.isdir(data_dir): + os.system("mkdir " + data_dir) + os.chdir(data_dir) + if (not os.path.exists(data_name)): + if sys.version_info[0] >= 3: + from urllib.request import urlretrieve + else: + from urllib import urlretrieve + zippath = os.path.join(data_dir, data_origin_name) + urlretrieve(url, zippath) + import bz2 + bz_file = bz2.BZ2File(data_origin_name, 'rb') + with open(data_name, 'wb') as fout: + try: + content = bz_file.read() + fout.write(content) + finally: + bz_file.close() + os.chdir("..") + + def check_libSVMIter_synthetic(): + cwd = os.getcwd() + data_path = os.path.join(cwd, 'data.t') + label_path = os.path.join(cwd, 'label.t') + with open(data_path, 'w') as fout: + fout.write('1.0 0:0.5 2:1.2\n') + fout.write('-2.0\n') + fout.write('-3.0 0:0.6 1:2.4 2:1.2\n') + fout.write('4 2:-1.2\n') + + with open(label_path, 'w') as fout: + fout.write('1.0\n') + fout.write('-2.0 0:0.125\n') + fout.write('-3.0 2:1.2\n') + fout.write('4 1:1.0 2:-1.2\n') + + data_dir = os.path.join(cwd, 'data') + data_train = mx.io.LibSVMIter(data_libsvm=data_path, label_libsvm=label_path, + data_shape=(3, ), label_shape=(3, ), batch_size=3) + + first = mx.nd.array([[ 0.5, 0., 1.2], [ 0., 0., 0.], [ 0.6, 2.4, 1.2]]) + second = mx.nd.array([[ 0., 0., -1.2], [ 0.5, 0., 1.2], [ 0., 0., 0.]]) + i = 0 + for batch in iter(data_train): + expected = first.asnumpy() if i == 0 else second.asnumpy() + assert_almost_equal(data_train.getdata().asnumpy(), expected) + i += 1 + + def 
check_libSVMIter_news_data(): + news_metadata = { + 'name': 'news20.t', + 'origin_name': 'news20.t.bz2', + 'url': "http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/news20.t.bz2", + 'feature_dim': 62060, + 'num_classes': 20, + 'num_examples': 3993, + } + num_parts = 3 + batch_size = 128 + num_examples = news_metadata['num_examples'] + data_dir = os.path.join(os.getcwd(), 'data') + get_data(data_dir, news_metadata['name'], news_metadata['url'], + news_metadata['origin_name']) + path = os.path.join(data_dir, news_metadata['name']) + data_train = mx.io.LibSVMIter(data_libsvm=path, data_shape=(news_metadata['feature_dim'],), + batch_size=batch_size, num_parts=num_parts, part_index=0) + num_batches = 0 + iterator = iter(data_train) + for batch in iterator: + # check the range of labels + assert(np.sum(batch.label[0].asnumpy() > 20) == 0) + assert(np.sum(batch.label[0].asnumpy() <= 0) == 0) + num_batches += 1 + import math + expected_num_batches = math.ceil(num_examples * 1.0 / batch_size / num_parts) + assert(num_batches == int(expected_num_batches)), (num_batches, expected_num_batches) + + check_libSVMIter_synthetic() + check_libSVMIter_news_data() if __name__ == "__main__": test_NDArrayIter() @@ -159,3 +263,5 @@ def test_NDArrayIter_h5py(): test_NDArrayIter_h5py() test_MNISTIter() test_Cifar10Rec() + test_LibSVMIter() + test_NDArrayIter_csr() diff --git a/tests/python/unittest/test_kvstore.py b/tests/python/unittest/test_kvstore.py index f1e10c757fad..a43b98a635fb 100644 --- a/tests/python/unittest/test_kvstore.py +++ b/tests/python/unittest/test_kvstore.py @@ -18,44 +18,74 @@ # pylint: skip-file import mxnet as mx import numpy as np +from mxnet.test_utils import rand_ndarray, assert_almost_equal shape = (4, 4) keys = [5, 7, 11] str_keys = ['b', 'c', 'd'] -def init_kv(): +def init_kv(stype='default'): """init kv """ kv = mx.kv.create() # single - kv.init(3, mx.nd.zeros(shape)) + kv.init(3, mx.nd.zeros(shape=shape, stype=stype)) # list - kv.init(keys, 
[mx.nd.zeros(shape)] * len(keys)) + kv.init(keys, [mx.nd.zeros(shape=shape, stype=stype)] * len(keys)) return kv -def init_kv_with_str(): +def init_kv_with_str(stype='default'): """init kv """ kv = mx.kv.create() # single - kv.init('a', mx.nd.zeros(shape)) + kv.init('a', mx.nd.zeros(shape, stype=stype)) # list - kv.init(str_keys, [mx.nd.zeros(shape)] * len(keys)) + kv.init(str_keys, [mx.nd.zeros(shape=shape, stype=stype)] * len(keys)) return kv def check_diff_to_scalar(A, x): """ assert A == x""" assert(np.sum(np.abs((A - x).asnumpy())) == 0) + def test_single_kv_pair(): """single key-value pair push & pull""" def check_single_kv_pair(kv, key): kv.push(key, mx.nd.ones(shape)) val = mx.nd.empty(shape) - kv.pull(key, out = val) + kv.pull(key, out=val) check_diff_to_scalar(val, 1) check_single_kv_pair(init_kv(), 3) check_single_kv_pair(init_kv_with_str(), 'a') +def test_row_sparse_pull(): + kv = init_kv_with_str('row_sparse') + kv.init('e', mx.nd.ones(shape).tostype('row_sparse')) + + def check_row_sparse_pull(kv, count): + num_rows = shape[0] + vals = [] + row_ids = [] + all_row_ids = np.arange(num_rows) + for i in range(count): + vals.append(mx.nd.zeros(shape).tostype('row_sparse')) + row_id = np.random.randint(num_rows, size=num_rows) + row_ids.append(mx.nd.array(row_id, dtype='int64')) + row_ids_to_pull = row_ids[0] if len(row_ids) == 1 else row_ids + vals_to_pull = vals[0] if len(vals) == 1 else vals + + kv.row_sparse_pull('e', out=vals_to_pull, row_ids=row_ids_to_pull) + for val, row_id in zip(vals, row_ids): + retained = val.asnumpy() + excluded_row_ids = np.setdiff1d(all_row_ids, row_id.asnumpy()) + for row in range(num_rows): + expected_val = np.zeros_like(retained[row]) + expected_val += 0 if row in excluded_row_ids else 1 + assert_almost_equal(retained[row], expected_val) + + check_row_sparse_pull(kv, 1) + check_row_sparse_pull(kv, 4) + def test_init(): """test init""" def check_init(kv, key): @@ -72,7 +102,7 @@ def test_list_kv_pair(): def 
check_list_kv_pair(kv, key): kv.push(key, [mx.nd.ones(shape)*4] * len(key)) val = [mx.nd.empty(shape)] * len(key) - kv.pull(key, out = val) + kv.pull(key, out=val) for v in val: check_diff_to_scalar(v, 4) @@ -92,7 +122,7 @@ def check_aggregator(kv, key, key_list): vals = [mx.nd.ones(shape, d) for d in devs] kv.push(key, vals) - kv.pull(key, out = vals) + kv.pull(key, out=vals) for v in vals: check_diff_to_scalar(v, num_devs) @@ -100,7 +130,7 @@ def check_aggregator(kv, key, key_list): # list vals = [[mx.nd.ones(shape, d)*2.0 for d in devs]] * len(key_list) kv.push(key_list, vals) - kv.pull(key_list, out = vals) + kv.pull(key_list, out=vals) for vv in vals: for v in vv: @@ -110,10 +140,50 @@ def check_aggregator(kv, key, key_list): check_aggregator(init_kv_with_str(), 'a', str_keys) +def test_sparse_aggregator(): + """aggregate sparse ndarray on muliple devices""" + + stype = 'row_sparse' + kv = init_kv_with_str(stype) + + # devices + num_devs = 4 + devs = [mx.Context('cpu', i) for i in range(num_devs)] + + # single + vals = [rand_ndarray(shape, stype).copyto(devs[i]) for i in range(num_devs)] + expected_sum = np.zeros(shape) + for v in vals: + expected_sum += v.asnumpy() + + # prepare row_ids + all_rows = mx.nd.array(np.arange(shape[0]), dtype='int64') + kv.push('a', vals) + kv.row_sparse_pull('a', out=vals, row_ids=[all_rows] * len(vals)) + result_sum = np.zeros(shape) + for v in vals: + result_sum += v.asnumpy() + assert_almost_equal(result_sum, expected_sum * num_devs) + + # list + vals = [[rand_ndarray(shape, stype).copyto(devs[i]) for i in range(num_devs)]] * len(keys) + expected_sum = np.zeros(shape) + for v in vals[0]: + expected_sum += v.asnumpy() + + kv.push(str_keys, vals) + kv.row_sparse_pull(str_keys, out=vals, row_ids=[[all_rows] * num_devs] * len(vals)) + for vv in vals: + result_sum = np.zeros(shape) + for v in vv: + result_sum += v.asnumpy() + assert_almost_equal(result_sum, expected_sum * num_devs) + def updater(key, recv, local): """use updater: 
+=""" local += recv + def test_updater(dev = 'cpu'): """updater""" @@ -126,7 +196,7 @@ def check_updater(kv, key, key_list): vals = [mx.nd.ones(shape, d) for d in devs] kv.push(key, vals) - kv.pull(key, out = vals) + kv.pull(key, out=vals) for v in vals: check_diff_to_scalar(v, num_devs) @@ -138,7 +208,7 @@ def check_updater(kv, key, key_list): for i in range(num_push): kv.push(key_list, vals) - kv.pull(key_list, out = vals) + kv.pull(key_list, out=vals) for vv in vals: for v in vv: @@ -152,16 +222,54 @@ def check_updater(kv, key, key_list): str_kv._set_updater(updater) check_updater(str_kv, 'a', str_keys) - def test_get_type(): kvtype = 'local_allreduce_cpu' kv = mx.kv.create(kvtype) assert kv.type == kvtype +def test_invalid_pull(): + def check_invalid_single_kv_pair(kv, key): + dns_val = mx.nd.ones(shape) * 2 + rsp_val = dns_val.tostype('row_sparse') + kv.pull(key, out=rsp_val) + # pull should be ignored with no values updated + check_diff_to_scalar(rsp_val, 2) + try: + # row_sparse_pull should be aborted when vals.stype != row_sparse + kv.row_sparse_pull(key, out=dns_val, rowids=mx.nd.array([1])) + assert(False) + except: + pass + + def check_invalid_list_kv_pair(kv, key): + dns_val = [mx.nd.ones(shape) * 2] * len(key) + rsp_val = [val.tostype('row_sparse') for val in dns_val] + kv.pull(key, out=rsp_val) + for v in rsp_val: + # pull should be ignored with no values updated + check_diff_to_scalar(v, 2) + try: + # row_sparse_pull should be aborted when vals.stype != row_sparse + kv.row_sparse_pull(key, out=dns_val, rowids=[mx.nd.array([1])] * len(key)) + assert(False) + except: + pass + + int_kv = init_kv() + str_kv = init_kv_with_str() + + check_invalid_single_kv_pair(int_kv, 3) + check_invalid_single_kv_pair(str_kv, 'a') + + check_invalid_list_kv_pair(int_kv, keys) + check_invalid_list_kv_pair(str_kv, str_keys) + if __name__ == '__main__': test_init() test_get_type() test_single_kv_pair() test_list_kv_pair() + test_sparse_aggregator() test_aggregator() 
test_updater() + test_row_sparse_pull() diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index f522f29dae39..9d8d76f5aa92 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -17,12 +17,15 @@ import mxnet as mx import mxnet.ndarray as nd +from mxnet.test_utils import * import numpy as np from functools import reduce from mxnet.module.executor_group import DataParallelExecutorGroup from common import assertRaises from collections import namedtuple +import numpy.random as rnd + def test_module_dtype(): dtype = np.float16 @@ -345,7 +348,6 @@ def mean_abs(x): break assert(mon_result_counts == [2, 2, 1, 6, 6, 4]) - def test_executor_group(): def get_rnn_sym(num_layers, num_words, num_hidden, num_embed, seq_len): stack = mx.rnn.SequentialRNNCell() @@ -458,6 +460,107 @@ def test_shared_exec_group(exec_grp_shared, exec_grp_created, shared_arg_names=N shared_arg_names=shared_arg_names, extra_args=extra_args) +def test_factorization_machine_module(): + """ Test factorization machine model with sparse operators """ + mx.random.seed(11) + rnd.seed(11) + + def fm(factor_size, feature_dim, init): + x = mx.symbol.Variable("data", stype='csr') + v = mx.symbol.Variable("v", shape=(feature_dim, factor_size), + init=init, stype='row_sparse') + + w1_weight = mx.symbol.var('w1_weight', shape=(feature_dim, 1), + init=init, stype='row_sparse') + w1_bias = mx.symbol.var('w1_bias', shape=(1)) + w1 = mx.symbol.broadcast_add(mx.symbol.dot(x, w1_weight), w1_bias) + + v_s = mx.symbol._internal._square_sum(data=v, axis=1, keepdims=True) + x_s = mx.symbol.square(data=x) + bd_sum = mx.sym.dot(x_s, v_s) + + w2 = mx.symbol.dot(x, v) + w2_squared = 0.5 * mx.symbol.square(data=w2) + + w_all = mx.symbol.Concat(w1, w2_squared, dim=1) + sum1 = mx.symbol.sum(data=w_all, axis=1, keepdims=True) + sum2 = 0.5 * mx.symbol.negative(bd_sum) + model = mx.sym.elemwise_add(sum1, sum2) + + y = mx.symbol.Variable("label") + model = 
mx.symbol.LinearRegressionOutput(data=model, label=y) + return model + + # model + ctx = default_context() + init = mx.initializer.Normal(sigma=0.01) + factor_size = 4 + feature_dim = 10000 + model = fm(factor_size, feature_dim, init) + + # data iter + num_batches = 5 + batch_size = 64 + num_samples = batch_size * num_batches + import scipy.sparse as sp + # generate some random scipy csr data + csr_sp = sp.rand(num_samples, feature_dim, density=0.1, format='csr') + csr_nd = mx.nd.sparse.csr_matrix(csr_sp.data, csr_sp.indptr, csr_sp.indices, + (num_samples, feature_dim)) + label = mx.nd.ones((num_samples,1)) + # the alternative is to use LibSVMIter + train_iter = mx.io.NDArrayIter(data=csr_nd, + label={'label':label}, + batch_size=batch_size) + # create module + mod = mx.mod.Module(symbol=model, data_names=['data'], label_names=['label']) + # allocate memory by given the input data and lable shapes + mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) + # initialize parameters by uniform random numbers + mod.init_params(initializer=init) + # use Sparse SGD with learning rate 0.1 to train + adam = mx.optimizer.Adam(clip_gradient=5.0, learning_rate=0.001, rescale_grad=1.0/batch_size) + mod.init_optimizer(optimizer=adam) + # use accuracy as the metric + metric = mx.metric.create('MSE') + # train 10 epoch + for epoch in range(10): + train_iter.reset() + metric.reset() + for batch in train_iter: + mod.forward(batch, is_train=True) # compute predictions + mod.update_metric(metric, batch.label) # accumulate prediction accuracy + mod.backward() # compute gradients + mod.update() # update parameters + # print('Epoch %d, Training %s' % (epoch, metric.get())) + assert(metric.get()[1] < 0.05), metric.get()[1] + + +def test_module_initializer(): + def regression_model(m): + x = mx.symbol.var("data", stype='csr') + v = mx.symbol.var("v", shape=(m, 1), init=mx.init.Uniform(scale=.1), + stype='row_sparse') + model = mx.symbol.dot(lhs=x, rhs=v) + y = 
mx.symbol.Variable("label") + model = mx.symbol.LinearRegressionOutput(data=model, label=y, name="out") + return model + + n, m = 128, 100 + model = regression_model(m) + + data = mx.nd.zeros(shape=(n, m), stype='csr') + label = mx.nd.zeros((n, 1)) + iterator = mx.io.NDArrayIter(data=data, label={'label':label}, batch_size=n) + + # create module + mod = mx.mod.Module(symbol=model, data_names=['data'], label_names=['label']) + mod.bind(data_shapes=iterator.provide_data, label_shapes=iterator.provide_label) + mod.init_params() + v = mod._arg_params['v'] + assert(v.stype == 'row_sparse') + assert(np.sum(v.asnumpy()) != 0) + def test_forward_reshape(): num_class=10 data1 = mx.sym.Variable('data1') diff --git a/tests/python/unittest/test_multi_device_exec.py b/tests/python/unittest/test_multi_device_exec.py index 6f8eb17ff34e..0a2739d9bb4e 100644 --- a/tests/python/unittest/test_multi_device_exec.py +++ b/tests/python/unittest/test_multi_device_exec.py @@ -16,6 +16,7 @@ # under the License. import os +import numpy as np import mxnet as mx def test_ctx_group(): @@ -49,5 +50,31 @@ def test_ctx_group(): else: assert arr.context == group2ctx['stage2'] +def test_ctx_group_sparse(): + with mx.AttrScope(ctx_group='stage1'): + lhs = mx.symbol.Variable('lhs', stype='csr') + rhs = mx.symbol.Variable('rhs', stype='row_sparse') + dot = mx.symbol.dot(lhs, rhs, name='dot') + + set_stage1 = set(dot.list_arguments()) + with mx.AttrScope(ctx_group='stage2'): + softmax = mx.symbol.SoftmaxOutput(data = dot, name = 'softmax') + + set_stage2 = set(softmax.list_arguments()) - set_stage1 + + group2ctx = { + 'stage1' : mx.cpu(1), + 'stage2' : mx.cpu(2) + } + texec = softmax.simple_bind(mx.cpu(0), group2ctx=group2ctx, + lhs=(32,200), rhs=(200, 5)) + + for arr, name in zip(texec.arg_arrays, softmax.list_arguments()): + if name in set_stage1: + assert arr.context == group2ctx['stage1'] + else: + assert arr.context == group2ctx['stage2'] + if __name__ == '__main__': test_ctx_group() + 
test_ctx_group_sparse() diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index eae364eeaecf..3e0ac66c168d 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -373,6 +373,7 @@ def test_dot(): assert_almost_equal(c, C.asnumpy()) + def test_reduce(): sample_num = 200 def test_reduce_inner(numpy_reduce_func, nd_reduce_func, multi_axes): diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index a33cb039c849..11d0ea22319a 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -855,75 +855,88 @@ def test_nearest_upsampling(): check_nearest_upsampling_with_shape(shapes, scale, root_scale) def test_batchnorm_training(): - for shape in [(2, 3), (2, 3, 2, 2)]: - data_tmp = np.random.normal(-0.1, 0.1, size=shape) - s = shape[1], - gamma = np.ones(s) - beta = np.ones(s) - gamma[1] = 3 - beta[0] = 3 + def check_batchnorm_training(stype): + for shape in [(2, 3), (2, 3, 2, 2)]: + data_tmp = np.random.normal(-0.1, 0.1, size=shape) + s = shape[1], + gamma = np.ones(s) + beta = np.ones(s) + gamma[1] = 3 + beta[0] = 3 - rolling_mean = np.random.uniform(size=s) - rolling_std = np.random.uniform(size=s) + rolling_mean = np.random.uniform(size=s) + rolling_std = np.random.uniform(size=s) - data = mx.symbol.Variable('data') + data = mx.symbol.Variable('data', stype=stype) + in_location = [mx.nd.array(data_tmp).tostype(stype), mx.nd.array(gamma).tostype(stype), + mx.nd.array(beta).tostype(stype)] + mean_std = [mx.nd.array(rolling_mean).tostype(stype), mx.nd.array(rolling_std).tostype(stype)] - test = mx.symbol.BatchNorm_v1(data, fix_gamma=True) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-2, rtol=0.16) + test = mx.symbol.BatchNorm_v1(data, fix_gamma=True) + check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16) - test = 
mx.symbol.BatchNorm(data, fix_gamma=True) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-2, rtol=0.16) + test = mx.symbol.BatchNorm(data, fix_gamma=True) + check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16) - test = mx.symbol.BatchNorm_v1(data, fix_gamma=True, use_global_stats=True) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-2, rtol=0.16) + test = mx.symbol.BatchNorm_v1(data, fix_gamma=True, use_global_stats=True) + check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16) - test = mx.symbol.BatchNorm(data, fix_gamma=True, use_global_stats=True) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-2, rtol=0.16) + test = mx.symbol.BatchNorm(data, fix_gamma=True, use_global_stats=True) + check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16) - test = mx.symbol.BatchNorm_v1(data, fix_gamma=False) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-2, rtol=0.16) + test = mx.symbol.BatchNorm_v1(data, fix_gamma=False) + check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16) - test = mx.symbol.BatchNorm(data, fix_gamma=False) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-2, rtol=0.16) + test = mx.symbol.BatchNorm(data, fix_gamma=False) + check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16) - test = mx.symbol.BatchNorm_v1(data, fix_gamma=False, use_global_stats=True) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-2, rtol=0.16) + test = mx.symbol.BatchNorm_v1(data, fix_gamma=False, use_global_stats=True) + check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16) - test = mx.symbol.BatchNorm(data, fix_gamma=False, 
use_global_stats=True) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-2, rtol=0.16) + test = mx.symbol.BatchNorm(data, fix_gamma=False, use_global_stats=True) + check_numeric_gradient(test, in_location, mean_std, numeric_eps=1e-2, rtol=0.16) - # Test varying channel axis - dim = len(shape) - for chaxis in range(-dim, dim): - chaxis_true = chaxis - if chaxis < 0: - chaxis_true = dim + chaxis + # Test varying channel axis + dim = len(shape) + for chaxis in range(-dim, dim): + chaxis_true = chaxis + if chaxis < 0: + chaxis_true = dim + chaxis - shapex = shape + shapex = shape - channel_count = shapex[chaxis_true] - data_tmp = np.random.normal(-0.1, 0.1, size=shapex) + channel_count = shapex[chaxis_true] + data_tmp = np.random.normal(-0.1, 0.1, size=shapex) - gamma = np.ones(channel_count) - beta = np.ones(channel_count) - if channel_count > 1: - gamma[1] = 3 - beta[0] = 3 + gamma = np.ones(channel_count) + beta = np.ones(channel_count) + if channel_count > 1: + gamma[1] = 3 + beta[0] = 3 + + in_location = [mx.nd.array(data_tmp).tostype(stype), mx.nd.array(gamma).tostype(stype), + mx.nd.array(beta).tostype(stype)] + + xrolling_mean = np.random.uniform(size=channel_count) + xrolling_std = np.random.uniform(size=channel_count) + xmean_std = [mx.nd.array(xrolling_mean).tostype(stype), + mx.nd.array(xrolling_std).tostype(stype)] - xrolling_mean = np.random.uniform(size=channel_count) - xrolling_std = np.random.uniform(size=channel_count) + test = mx.symbol.BatchNorm(data, fix_gamma=True, axis=chaxis) + check_numeric_gradient(test, in_location, xmean_std, numeric_eps=1e-2, rtol=0.2, atol=0.01) - test = mx.symbol.BatchNorm(data, fix_gamma=True, axis=chaxis) - check_numeric_gradient(test, [data_tmp, gamma, beta], [xrolling_mean, xrolling_std], numeric_eps=1e-2, rtol=0.2, atol=0.01) + test = mx.symbol.BatchNorm(data, fix_gamma=True, use_global_stats=True, axis=chaxis) + check_numeric_gradient(test, in_location, xmean_std, 
numeric_eps=1e-2, rtol=0.2, atol=0.01) - test = mx.symbol.BatchNorm(data, fix_gamma=True, use_global_stats=True, axis=chaxis) - check_numeric_gradient(test, [data_tmp, gamma, beta], [xrolling_mean, xrolling_std], numeric_eps=1e-2, rtol=0.2, atol=0.01) + test = mx.symbol.BatchNorm(data, fix_gamma=False, axis=chaxis) + check_numeric_gradient(test, in_location, xmean_std, numeric_eps=1e-2, rtol=0.2, atol=0.01) - test = mx.symbol.BatchNorm(data, fix_gamma=False, axis=chaxis) - check_numeric_gradient(test, [data_tmp, gamma, beta], [xrolling_mean, xrolling_std], numeric_eps=1e-2, rtol=0.2, atol=0.01) + test = mx.symbol.BatchNorm(data, fix_gamma=False, use_global_stats=True, axis=chaxis) + check_numeric_gradient(test, in_location, xmean_std, numeric_eps=1e-2, rtol=0.2, atol=0.01) - test = mx.symbol.BatchNorm(data, fix_gamma=False, use_global_stats=True, axis=chaxis) - check_numeric_gradient(test, [data_tmp, gamma, beta], [xrolling_mean, xrolling_std], numeric_eps=1e-2, rtol=0.2, atol=0.01) + stypes = ['row_sparse', 'default'] + for stype in stypes: + check_batchnorm_training(stype) def test_convolution_grouping(): num_filter = 4 diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index 3b3b92b372d8..055f6464f0ef 100644 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -47,26 +47,43 @@ def test_lr_wd_mult(): assert not mx.test_utils.almost_equal(args1['fc2_weight'], args2['fc2_weight'], 1e-1) -def compare_optimizer(opt1, opt2, shape, dtype): - w1 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - g1 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - - w2 = w1.copyto(default_context()) - g2 = g1.copyto(default_context()) +def compare_optimizer(opt1, opt2, shape, dtype, w_stype='default', g_stype='default'): + if w_stype == 'default': + w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) + w1 = w2.copyto(default_context()) + elif 
w_stype == 'row_sparse' or w_stype == 'csr': + w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) + w1 = w2.copyto(default_context()).tostype('default') + else: + raise Exception("type not supported yet") + if g_stype == 'default': + g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) + g1 = g2.copyto(default_context()) + elif g_stype == 'row_sparse' or g_stype == 'csr': + g2 = rand_ndarray(shape, g_stype, dtype=dtype) + g1 = g2.copyto(default_context()).tostype('default') + else: + raise Exception("type not supported yet") state1 = opt1.create_state(0, w1) state2 = opt2.create_state(0, w2) if state1 is not None and state2 is not None: - for s1, s2, in zip(state1, state2): - if s1 is not None or s2 is not None: - assert(same(s1.asnumpy(), s2.asnumpy())) + if isinstance(state1, tuple): + for s1, s2, in zip(state1, state2): + if s1 is not None or s2 is not None: + assert(same(s1.asnumpy(), s2.asnumpy())) + else: + assert_almost_equal(state1.asnumpy(), state2.asnumpy()) opt1.update(0, w1, g1, state1) opt2.update(0, w2, g2, state2) if state1 is not None and state2 is not None: - for s1, s2, in zip(state1, state2): - if s1 is not None or s2 is not None: - assert_almost_equal(s1.asnumpy(), s2.asnumpy(), rtol=1e-4, atol=1e-5) + if isinstance(state1, tuple): + for s1, s2, in zip(state1, state2): + if s1 is not None or s2 is not None: + assert_almost_equal(s1.asnumpy(), s2.asnumpy(), rtol=1e-4, atol=1e-5) + else: + assert_almost_equal(state1.asnumpy(), state2.asnumpy()) assert_almost_equal(w1.asnumpy(), w2.asnumpy(), rtol=1e-4, atol=1e-5) # SGD @@ -186,18 +203,122 @@ def test_sgd(): not kwarg['multi_precision'])): continue compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) + # test operator fallback on cpu + if (default_context() == mx.cpu()): + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, + g_stype='row_sparse') + if dtype != np.float16: + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape[:2], + dtype, 
w_stype='csr', g_stype='csr') + +class PySparseSGD(mx.optimizer.Optimizer): + """python reference implemenation of sgd""" + def __init__(self, learning_rate=0.01, momentum=0.0, **kwargs): + super(PySparseSGD, self).__init__(learning_rate=learning_rate, **kwargs) + self.momentum = momentum + + def create_state(self, index, weight): + """Create additional optimizer state: momentum + + Parameters + ---------- + weight : NDArray + The weight data + + """ + if self.momentum == 0.0: + return None + else: + return mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype) + + def update(self, index, weight, grad, state): + """Update the parameters. + + Parameters + ---------- + index : int + An unique integer key used to index the parameters + + weight : NDArray + weight ndarray + + grad : NDArray + grad ndarray + + state : NDArray or other objects returned by init_state + The auxiliary state used in optimization. + """ + lr = self._get_lr(index) + wd = self._get_wd(index) + self._update_count(index) + num_rows = weight.shape[0] + if self.momentum == 0.0: + # Update on a per row basis, skip all-zero rows + for row in range(num_rows): + grad_row = grad[row].asnumpy() + all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) + if all_zeros: + continue + if self.clip_gradient is not None: + weight[row] = ((1 - lr*wd)*weight[row] - + lr*mx.nd.clip(grad[row]*self.rescale_grad, + -self.clip_gradient, self.clip_gradient)) + else: + weight[row] = (1 - lr*wd)*weight[row] - lr*self.rescale_grad*grad[row] + else: + mom = state + for row in range(num_rows): + grad_row = grad[row].asnumpy() + all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) + if all_zeros: + continue + if self.clip_gradient is not None: + mom[row] = (self.momentum*mom[row] - lr*wd*weight[row] - + lr*mx.nd.clip(grad[row]*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) + weight[row] += mom[row] + else: + mom[row] = self.momentum*mom[row] - lr*wd*weight[row] - 
lr*self.rescale_grad*grad[row] + weight[row] += mom[row] + +def test_sparse_sgd(): + mx.random.seed(0) + opt1 = PySparseSGD + opt2 = mx.optimizer.SGD + shape = (3, 4, 5) + mom_options = [{}, {'momentum': 0.9}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{}] + for dtype in [np.float32]: + for mom_option in mom_options: + for cg_option in cg_options: + for rg_option in rg_options: + for wd_option in wd_options: + for mp_option in mp_options: + kwarg = {} + kwarg.update(mom_option) + kwarg.update(cg_option) + kwarg.update(rg_option) + kwarg.update(wd_option) + kwarg.update(mp_option) + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, + w_stype='row_sparse', g_stype='row_sparse') + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, + w_stype='row_sparse', g_stype='default') # ADAM class PyAdam(mx.optimizer.Optimizer): """python reference implemenation of adam""" def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, - decay_factor=(1 - 1e-8), **kwargs): + decay_factor=(1 - 1e-8), sparse_update=False, **kwargs): super(PyAdam, self).__init__(learning_rate=learning_rate, **kwargs) self.beta1 = beta1 self.beta2 = beta2 self.epsilon = epsilon self.decay_factor = decay_factor + self.sparse_update = sparse_update def create_state(self, index, weight): """Create additional optimizer state: mean, variance @@ -235,21 +356,28 @@ def update(self, index, weight, grad, state): mean, variance = state wd = self._get_wd(index) - grad = grad * self.rescale_grad + wd * weight - if self.clip_gradient is not None: - mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient, out=grad) - - mean *= self.beta1 - mean += grad * (1. - self.beta1) - - variance *= self.beta2 - variance += (1 - self.beta2) * mx.nd.square(grad, out=grad) - + num_rows = weight.shape[0] coef1 = 1. 
- self.beta1**t coef2 = 1. - self.beta2**t lr *= math.sqrt(coef2)/coef1 - - weight -= lr*mean/(mx.nd.sqrt(variance) + self.epsilon) + for row in range(num_rows): + # check row slices of all zeros + all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) + # skip zeros during sparse update + if all_zeros and self.sparse_update: + continue + grad[row] = grad[row] * self.rescale_grad + wd * weight[row] + # clip gradients + if self.clip_gradient is not None: + mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) + # update mean + mean[row] *= self.beta1 + mean[row] += grad[row] * (1. - self.beta1) + # update variance + variance[row] *= self.beta2 + variance[row] += (1 - self.beta2) * mx.nd.square(grad[row], out=grad[row]) + # update weight + weight[row] -= lr*mean[row]/(mx.nd.sqrt(variance[row]) + self.epsilon) def test_adam(): @@ -266,6 +394,8 @@ def test_adam(): {'rescale_grad': 0.8, 'wd': 0.05}] for kwarg in kwargs: compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float32) + compare_optimizer(opt1(sparse_update=True, **kwarg), opt2(**kwarg), shape, + np.float32, w_stype='row_sparse', g_stype='row_sparse') # RMSProp class PyRMSProp(mx.optimizer.Optimizer): @@ -406,8 +536,10 @@ def test_rms(): {'rescale_grad': 0.8, 'wd': 0.05, 'centered': True, 'clip_weights': 0.01}] for kwarg in kwargs: compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float32) + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float32, g_stype='row_sparse') if __name__ == '__main__': test_adam() test_rms() test_sgd() + test_sparse_sgd() diff --git a/tests/python/unittest/test_sparse_ndarray.py b/tests/python/unittest/test_sparse_ndarray.py new file mode 100644 index 000000000000..a77343436945 --- /dev/null +++ b/tests/python/unittest/test_sparse_ndarray.py @@ -0,0 +1,524 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pickle as pkl + +from mxnet.ndarray import NDArray +from mxnet.test_utils import * +from numpy.testing import assert_allclose +import numpy.random as rnd + +from mxnet.ndarray.sparse import RowSparseNDArray, CSRNDArray + + +def assert_fcompex(f, *args, **kwargs): + prev_val = mx.test_utils.set_env_var("MXNET_EXEC_STORAGE_FALLBACK", "0", "1") + f(*args, **kwargs) + mx.test_utils.set_env_var("MXNET_EXEC_STORAGE_FALLBACK", prev_val) + + +def sparse_nd_ones(shape, stype): + return mx.nd.ones(shape).tostype(stype) + + +def check_sparse_nd_elemwise_binary(shapes, stypes, f, g): + # generate inputs + nds = [] + for i, stype in enumerate(stypes): + if stype == 'row_sparse': + nd, _ = rand_sparse_ndarray(shapes[i], stype) + elif stype == 'default': + nd = mx.nd.array(random_arrays(shapes[i]), dtype = np.float32) + else: + assert(False) + nds.append(nd) + # check result + test = f(nds[0], nds[1]) + assert_almost_equal(test.asnumpy(), g(nds[0].asnumpy(), nds[1].asnumpy())) + + +def test_sparse_nd_elemwise_add(): + num_repeats = 10 + g = lambda x,y: x + y + op = mx.nd.elemwise_add + for i in range(num_repeats): + shape = [rand_shape_2d()] * 2 + assert_fcompex(check_sparse_nd_elemwise_binary, + shape, ['default'] * 2, op, g) + assert_fcompex(check_sparse_nd_elemwise_binary, + shape, 
['default', 'row_sparse'], op, g) + assert_fcompex(check_sparse_nd_elemwise_binary, + shape, ['row_sparse', 'row_sparse'], op, g) + + +def test_sparse_nd_copy(): + def check_sparse_nd_copy(from_stype, to_stype, shape): + from_nd = rand_ndarray(shape, from_stype) + # copy to ctx + to_ctx = from_nd.copyto(default_context()) + # copy to stype + to_nd = rand_ndarray(shape, to_stype) + to_nd = from_nd.copyto(to_nd) + assert np.sum(np.abs(from_nd.asnumpy() != to_ctx.asnumpy())) == 0.0 + assert np.sum(np.abs(from_nd.asnumpy() != to_nd.asnumpy())) == 0.0 + + shape = rand_shape_2d() + shape_3d = rand_shape_3d() + stypes = ['row_sparse', 'csr'] + for stype in stypes: + check_sparse_nd_copy(stype, 'default', shape) + check_sparse_nd_copy('default', stype, shape) + check_sparse_nd_copy('row_sparse', 'row_sparse', shape_3d) + check_sparse_nd_copy('row_sparse', 'default', shape_3d) + check_sparse_nd_copy('default', 'row_sparse', shape_3d) + +def test_sparse_nd_basic(): + def check_sparse_nd_basic_rsp(): + storage_type = 'row_sparse' + shape = rand_shape_2d() + nd, (v, idx) = rand_sparse_ndarray(shape, storage_type) + assert(nd._num_aux == 1) + assert(nd.indices.dtype == np.int64) + assert(nd.stype == 'row_sparse') + + check_sparse_nd_basic_rsp() + + +def test_sparse_nd_setitem(): + def check_sparse_nd_setitem(stype, shape, dst): + x = mx.nd.zeros(shape=shape, stype=stype) + x[:] = dst + dst_nd = mx.nd.array(dst) if isinstance(dst, (np.ndarray, np.generic)) else dst + assert same(x.asnumpy(), dst_nd.asnumpy()) + + shape = rand_shape_2d() + for stype in ['row_sparse', 'csr']: + # ndarray assignment + check_sparse_nd_setitem(stype, shape, rand_ndarray(shape, 'default')) + check_sparse_nd_setitem(stype, shape, rand_ndarray(shape, stype)) + # numpy assignment + check_sparse_nd_setitem(stype, shape, np.ones(shape)) + + +def test_sparse_nd_slice(): + def check_sparse_nd_csr_slice(shape): + stype = 'csr' + A, _ = rand_sparse_ndarray(shape, stype) + A2 = A.asnumpy() + start = 
rnd.randint(0, shape[0] - 1) + end = rnd.randint(start + 1, shape[0]) + assert same(A[start:end].asnumpy(), A2[start:end]) + assert same(A[start:].asnumpy(), A2[start:]) + assert same(A[:end].asnumpy(), A2[:end]) + + shape = (rnd.randint(2, 10), rnd.randint(1, 10)) + check_sparse_nd_csr_slice(shape) + + +def test_sparse_nd_equal(): + for stype in ['row_sparse', 'csr']: + shape = rand_shape_2d() + x = mx.nd.zeros(shape=shape, stype=stype) + y = sparse_nd_ones(shape, stype) + z = x == y + assert (z.asnumpy() == np.zeros(shape)).all() + z = 0 == x + assert (z.asnumpy() == np.ones(shape)).all() + + +def test_sparse_nd_not_equal(): + for stype in ['row_sparse', 'csr']: + shape = rand_shape_2d() + x = mx.nd.zeros(shape=shape, stype=stype) + y = sparse_nd_ones(shape, stype) + z = x != y + assert (z.asnumpy() == np.ones(shape)).all() + z = 0 != x + assert (z.asnumpy() == np.zeros(shape)).all() + + +def test_sparse_nd_greater(): + for stype in ['row_sparse', 'csr']: + shape = rand_shape_2d() + x = mx.nd.zeros(shape=shape, stype=stype) + y = sparse_nd_ones(shape, stype) + z = x > y + assert (z.asnumpy() == np.zeros(shape)).all() + z = y > 0 + assert (z.asnumpy() == np.ones(shape)).all() + z = 0 > y + assert (z.asnumpy() == np.zeros(shape)).all() + + +def test_sparse_nd_greater_equal(): + for stype in ['row_sparse', 'csr']: + shape = rand_shape_2d() + x = mx.nd.zeros(shape=shape, stype=stype) + y = sparse_nd_ones(shape, stype) + z = x >= y + assert (z.asnumpy() == np.zeros(shape)).all() + z = y >= 0 + assert (z.asnumpy() == np.ones(shape)).all() + z = 0 >= y + assert (z.asnumpy() == np.zeros(shape)).all() + z = y >= 1 + assert (z.asnumpy() == np.ones(shape)).all() + + +def test_sparse_nd_lesser(): + for stype in ['row_sparse', 'csr']: + shape = rand_shape_2d() + x = mx.nd.zeros(shape=shape, stype=stype) + y = sparse_nd_ones(shape, stype) + z = y < x + assert (z.asnumpy() == np.zeros(shape)).all() + z = 0 < y + assert (z.asnumpy() == np.ones(shape)).all() + z = y < 0 + assert 
(z.asnumpy() == np.zeros(shape)).all() + + +def test_sparse_nd_lesser_equal(): + for stype in ['row_sparse', 'csr']: + shape = rand_shape_2d() + x = mx.nd.zeros(shape=shape, stype=stype) + y = sparse_nd_ones(shape, stype) + z = y <= x + assert (z.asnumpy() == np.zeros(shape)).all() + z = 0 <= y + assert (z.asnumpy() == np.ones(shape)).all() + z = y <= 0 + assert (z.asnumpy() == np.zeros(shape)).all() + z = 1 <= y + assert (z.asnumpy() == np.ones(shape)).all() + + +def test_sparse_nd_binary(): + N = 10 + def check_binary(fn, stype): + for _ in range(N): + ndim = 2 + oshape = np.random.randint(1, 6, size=(ndim,)) + bdim = 2 + lshape = list(oshape) + rshape = list(oshape[ndim-bdim:]) + for i in range(bdim): + sep = np.random.uniform(0, 1) + if sep < 0.33: + lshape[ndim-i-1] = 1 + elif sep < 0.66: + rshape[bdim-i-1] = 1 + lhs = np.random.uniform(0, 1, size=lshape) + rhs = np.random.uniform(0, 1, size=rshape) + lhs_nd = mx.nd.array(lhs).tostype(stype) + rhs_nd = mx.nd.array(rhs).tostype(stype) + assert_allclose(fn(lhs, rhs), fn(lhs_nd, rhs_nd).asnumpy(), rtol=1e-4, atol=1e-4) + + stypes = ['row_sparse', 'csr'] + for stype in stypes: + check_binary(lambda x, y: x + y, stype) + check_binary(lambda x, y: x - y, stype) + check_binary(lambda x, y: x * y, stype) + check_binary(lambda x, y: x / y, stype) + check_binary(lambda x, y: x ** y, stype) + check_binary(lambda x, y: x > y, stype) + check_binary(lambda x, y: x < y, stype) + check_binary(lambda x, y: x >= y, stype) + check_binary(lambda x, y: x <= y, stype) + check_binary(lambda x, y: x == y, stype) + + +def test_sparse_nd_binary_rop(): + N = 10 + def check(fn, stype): + for _ in range(N): + ndim = 2 + shape = np.random.randint(1, 6, size=(ndim,)) + npy = np.random.normal(0, 1, size=shape) + nd = mx.nd.array(npy).tostype(stype) + assert_allclose(fn(npy), fn(nd).asnumpy(), rtol=1e-4, atol=1e-4) + + stypes = ['row_sparse', 'csr'] + for stype in stypes: + check(lambda x: 1 + x, stype) + check(lambda x: 1 - x, stype) + 
check(lambda x: 1 * x, stype) + check(lambda x: 1 / x, stype) + check(lambda x: 2 ** x, stype) + check(lambda x: 1 > x, stype) + check(lambda x: 0.5 > x, stype) + check(lambda x: 0.5 < x, stype) + check(lambda x: 0.5 >= x, stype) + check(lambda x: 0.5 <= x, stype) + check(lambda x: 0.5 == x, stype) + +def test_sparse_nd_binary_iop(): + N = 10 + def check_binary(fn, stype): + for _ in range(N): + ndim = 2 + oshape = np.random.randint(1, 6, size=(ndim,)) + lshape = list(oshape) + rshape = list(oshape) + lhs = np.random.uniform(0, 1, size=lshape) + rhs = np.random.uniform(0, 1, size=rshape) + lhs_nd = mx.nd.array(lhs).tostype(stype) + rhs_nd = mx.nd.array(rhs).tostype(stype) + assert_allclose(fn(lhs, rhs), + fn(lhs_nd, rhs_nd).asnumpy(), + rtol=1e-4, atol=1e-4) + + def inplace_add(x, y): + x += y + return x + def inplace_mul(x, y): + x *= y + return x + stypes = ['csr', 'row_sparse'] + fns = [inplace_add, inplace_mul] + for stype in stypes: + for fn in fns: + check_binary(fn, stype) + +def test_sparse_nd_negate(): + def check_sparse_nd_negate(shape, stype): + npy = np.random.uniform(-10, 10, rand_shape_2d()) + arr = mx.nd.array(npy).tostype(stype) + assert_almost_equal(npy, arr.asnumpy()) + assert_almost_equal(-npy, (-arr).asnumpy()) + + # a final check to make sure the negation (-) is not implemented + # as inplace operation, so the contents of arr does not change after + # we compute (-arr) + assert_almost_equal(npy, arr.asnumpy()) + + shape = rand_shape_2d() + stypes = ['csr', 'row_sparse'] + for stype in stypes: + check_sparse_nd_negate(shape, stype) + +def test_sparse_nd_broadcast(): + sample_num = 1000 + # TODO(haibin) test with more than 2 dimensions + def test_broadcast_to(stype): + for i in range(sample_num): + ndim = 2 + target_shape = np.random.randint(1, 11, size=ndim) + shape = target_shape.copy() + axis_flags = np.random.randint(0, 2, size=ndim) + axes = [] + for (axis, flag) in enumerate(axis_flags): + if flag: + shape[axis] = 1 + dat = 
np.random.rand(*shape) - 0.5 + numpy_ret = dat + ndarray = mx.nd.array(dat).tostype(stype) + ndarray_ret = ndarray.broadcast_to(shape=target_shape) + if type(ndarray_ret) is mx.ndarray.NDArray: + ndarray_ret = ndarray_ret.asnumpy() + assert (ndarray_ret.shape == target_shape).all() + err = np.square(ndarray_ret - numpy_ret).mean() + assert err < 1E-8 + stypes = ['csr', 'row_sparse'] + for stype in stypes: + test_broadcast_to(stype) + + +def test_sparse_nd_transpose(): + npy = np.random.uniform(-10, 10, rand_shape_2d()) + stypes = ['csr', 'row_sparse'] + for stype in stypes: + nd = mx.nd.array(npy).tostype(stype) + assert_almost_equal(npy.T, (nd.T).asnumpy()) + +def test_sparse_nd_output_fallback(): + shape = (10, 10) + out = mx.nd.zeros(shape=shape, stype='row_sparse') + mx.nd.random_normal(shape=shape, out=out) + assert(np.sum(out.asnumpy()) != 0) + +def test_sparse_nd_random(): + """ test sparse random operator on cpu """ + # gpu random operator doesn't use fixed seed + if default_context().device_type is 'gpu': + return + shape = (100, 100) + fns = [mx.nd.random_uniform, mx.nd.random_normal, mx.nd.random_gamma] + for fn in fns: + rsp_out = mx.nd.zeros(shape=shape, stype='row_sparse') + dns_out = mx.nd.zeros(shape=shape, stype='default') + mx.random.seed(0) + np.random.seed(0) + fn(shape=shape, out=dns_out) + mx.random.seed(0) + np.random.seed(0) + fn(shape=shape, out=rsp_out) + assert_almost_equal(dns_out.asnumpy(), rsp_out.asnumpy()) + + +def test_sparse_nd_astype(): + stypes = ['row_sparse', 'csr'] + for stype in stypes: + x = mx.nd.zeros(shape=rand_shape_2d(), stype=stype, dtype='float32') + y = x.astype('int32') + assert(y.dtype == np.int32), y.dtype + + +def test_sparse_nd_pickle(): + np.random.seed(0) + repeat = 10 + dim0 = 40 + dim1 = 40 + stypes = ['row_sparse', 'csr'] + densities = [0, 0.01, 0.1, 0.2, 0.5] + stype_dict = {'row_sparse': RowSparseNDArray, 'csr': CSRNDArray} + for _ in range(repeat): + shape = rand_shape_2d(dim0, dim1) + for stype in 
stypes: + for density in densities: + a, _ = rand_sparse_ndarray(shape, stype, density) + assert isinstance(a, stype_dict[stype]) + data = pkl.dumps(a) + b = pkl.loads(data) + assert isinstance(b, stype_dict[stype]) + assert same(a.asnumpy(), b.asnumpy()) + + +def test_sparse_nd_save_load(): + np.random.seed(0) + repeat = 1 + stypes = ['default', 'row_sparse', 'csr'] + stype_dict = {'default': NDArray, 'row_sparse': RowSparseNDArray, 'csr': CSRNDArray} + num_data = 20 + densities = [0, 0.01, 0.1, 0.2, 0.5] + fname = 'tmp_list.bin' + for _ in range(repeat): + data_list1 = [] + for i in range(num_data): + stype = stypes[np.random.randint(0, len(stypes))] + shape = rand_shape_2d(dim0=40, dim1=40) + density = densities[np.random.randint(0, len(densities))] + data_list1.append(rand_ndarray(shape, stype, density)) + assert isinstance(data_list1[-1], stype_dict[stype]) + mx.nd.save(fname, data_list1) + + data_list2 = mx.nd.load(fname) + assert len(data_list1) == len(data_list2) + for x, y in zip(data_list1, data_list2): + assert same(x.asnumpy(), y.asnumpy()) + + data_map1 = {'ndarray xx %s' % i: x for i, x in enumerate(data_list1)} + mx.nd.save(fname, data_map1) + data_map2 = mx.nd.load(fname) + assert len(data_map1) == len(data_map2) + for k, x in data_map1.items(): + y = data_map2[k] + assert same(x.asnumpy(), y.asnumpy()) + os.remove(fname) + +def test_sparse_nd_unsupported(): + nd = mx.nd.zeros((2,2), stype='row_sparse') + fn_slice = lambda x: x._slice(None, None) + fn_at = lambda x: x._at(None) + fn_reshape = lambda x: x.reshape(None) + fns = [fn_slice, fn_at, fn_reshape] + for fn in fns: + try: + fn(nd) + assert(False) + except: + pass + +def test_create_csr(): + dim0 = 50 + dim1 = 50 + densities = [0, 0.01, 0.1, 0.2, 0.5] + for density in densities: + shape = rand_shape_2d(dim0, dim1) + matrix = rand_ndarray(shape, 'csr', density) + data = matrix.data + indptr = matrix.indptr + indices = matrix.indices + csr_created = mx.nd.sparse.csr_matrix(data=data, 
indptr=indptr, + indices=indices, shape=shape) + assert csr_created.stype == 'csr' + assert same(csr_created.data.asnumpy(), data.asnumpy()) + assert same(csr_created.indptr.asnumpy(), indptr.asnumpy()) + assert same(csr_created.indices.asnumpy(), indices.asnumpy()) + csr_copy = mx.nd.array(csr_created) + assert(same(csr_copy.asnumpy(), csr_created.asnumpy())) + + +def test_create_row_sparse(): + dim0 = 50 + dim1 = 50 + densities = [0, 0.01, 0.1, 0.2, 0.5] + for density in densities: + shape = rand_shape_2d(dim0, dim1) + matrix = rand_ndarray(shape, 'row_sparse', density) + data = matrix.data + indices = matrix.indices + rsp_created = mx.nd.sparse.row_sparse_array(data=data, indices=indices, shape=shape) + assert rsp_created.stype == 'row_sparse' + assert same(rsp_created.data.asnumpy(), data.asnumpy()) + assert same(rsp_created.indices.asnumpy(), indices.asnumpy()) + rsp_copy = mx.nd.array(rsp_created) + assert(same(rsp_copy.asnumpy(), rsp_created.asnumpy())) + +def test_sparse_nd_empty(): + stypes = ['csr', 'row_sparse', 'default'] + for stype in stypes: + nd = mx.nd.empty((2,2), stype=stype) + assert(nd.stype == stype) + + +def test_synthetic_dataset_generator(): + def test_powerlaw_generator(csr_arr, final_row=1): + """Test power law distribution + Total Elements: 32000, Number of zeros: 3200 + Every row has 2 * non zero elements of the previous row. 
+ Also since (2047 < 3200 < 4095) this will be true till 10th row""" + indices = csr_arr.indices.asnumpy() + indptr = csr_arr.indptr.asnumpy() + for row in range(1, final_row + 1): + nextrow = row + 1 + current_row_nnz = indices[indptr[row] - 1] + 1 + next_row_nnz = indices[indptr[nextrow] - 1] + 1 + assert next_row_nnz == 2 * current_row_nnz + + # Test if density is preserved + csr_arr_cols, _ = rand_sparse_ndarray(shape=(32, 10000), stype="csr", + density=0.01, distribution="powerlaw") + + csr_arr_small, _ = rand_sparse_ndarray(shape=(5, 5), stype="csr", + density=0.5, distribution="powerlaw") + + csr_arr_big, _ = rand_sparse_ndarray(shape=(32, 1000000), stype="csr", + density=0.4, distribution="powerlaw") + + csr_arr_square, _ = rand_sparse_ndarray(shape=(1600, 1600), stype="csr", + density=0.5, distribution="powerlaw") + assert len(csr_arr_cols.data) == 3200 + test_powerlaw_generator(csr_arr_cols, final_row=9) + test_powerlaw_generator(csr_arr_small, final_row=1) + test_powerlaw_generator(csr_arr_big, final_row=4) + test_powerlaw_generator(csr_arr_square, final_row=6) + + +if __name__ == '__main__': + import nose + nose.runmodule() diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py new file mode 100644 index 000000000000..2875d7b4b645 --- /dev/null +++ b/tests/python/unittest/test_sparse_operator.py @@ -0,0 +1,373 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from mxnet.test_utils import * + + +def check_elemwise_add_ex(lhs_stype, rhs_stype, shape, lhs_grad_stype=None, rhs_grad_stype=None): + lhs = mx.symbol.Variable('lhs', stype=lhs_stype) + rhs = mx.symbol.Variable('rhs', stype=rhs_stype) + lhs_nd = rand_ndarray(shape, lhs_stype) + rhs_nd = rand_ndarray(shape, rhs_stype) + lhs_np = lhs_nd.asnumpy() + rhs_np = rhs_nd.asnumpy() + + out_np = lhs_np + rhs_np + test = mx.symbol.sparse.elemwise_add(lhs, rhs) + location = {'lhs': lhs_nd, 'rhs': rhs_nd} + check_symbolic_forward(test, location, [out_np]) + check_numeric_gradient(test, location) + grad_stypes = {} + if lhs_grad_stype is not None and lhs_grad_stype != 'default': + grad_stypes['lhs'] = lhs_grad_stype + if rhs_grad_stype is not None and rhs_grad_stype != 'default': + grad_stypes['rhs'] = rhs_grad_stype + check_symbolic_backward(test, location, [out_np], [out_np, out_np], + grad_stypes=grad_stypes) + + +def test_elemwise_add_ex(): + shapes = [rand_shape_2d(), rand_shape_3d()] + for shape in shapes: + check_elemwise_add_ex('default', 'default', shape) + check_elemwise_add_ex('default', 'row_sparse', shape) + check_elemwise_add_ex('row_sparse', 'default', shape) + check_elemwise_add_ex('row_sparse', 'row_sparse', shape, + lhs_grad_stype='row_sparse', rhs_grad_stype='row_sparse') + + +# TODO(haibin) randomize this test +def test_elemwise_add_ex_multiple_stages(): + # prep data + shape = (4, 2) + ds_np = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + sp_np1 = np.array([[5, 10], [0, 0], [0, 0], [0, 0]]) + sp_np2 = np.array([[0, 0], [5, 10], [0, 0], [0, 0]]) + + 
val1 = mx.nd.array([[5, 10]]); + val2 = mx.nd.array([[5, 10]]); + idx1 = mx.nd.array([0], dtype=np.int64); + idx2 = mx.nd.array([1], dtype=np.int64); + sp_nd1 = mx.nd.sparse.row_sparse_array(val1, idx1, shape) + sp_nd2 = mx.nd.sparse.row_sparse_array(val2, idx2, shape) + ds_nd = mx.nd.array(ds_np) + + # sparse + sparse = sparse + sp_data1 = mx.symbol.Variable('sp_data1', stype='row_sparse') + sp_data2 = mx.symbol.Variable('sp_data2', stype='row_sparse') + ds_data = mx.symbol.Variable('ds_data') + plus = mx.symbol.sparse.elemwise_add(sp_data1, sp_data2, name='plus') + # sparse + dense = dense + test = mx.symbol.sparse.elemwise_add(plus, ds_data) + check_symbolic_forward(test, {'sp_data1': sp_nd1, 'sp_data2': sp_nd2, + 'ds_data': ds_nd}, [sp_np1 + sp_np2 + ds_np]) + + arr_grads = [mx.nd.zeros(shape) for i in range(3)] + exec_test = test.bind(default_context(), args={'sp_data1': sp_nd1, 'sp_data2': sp_nd2, + 'ds_data': ds_nd}, args_grad=arr_grads) + exec_test.forward(is_train=True) + assert_almost_equal(exec_test.outputs[0].asnumpy(), sp_np1 + sp_np2 + ds_np) + exec_test.backward(out_grads=exec_test.outputs) + assert_almost_equal(arr_grads[0].asnumpy(), arr_grads[1].asnumpy()) + +def test_cast_storage_ex(): + def check_cast_storage(shape, density, from_stype, to_stype, check_numeric_grad=True): + x = mx.symbol.Variable('x', stype=from_stype) + x_nd = rand_ndarray(shape, from_stype, density=density) + x_np = x_nd.asnumpy() + out_np = x_np + test = mx.symbol.cast_storage(x, stype=to_stype) + location = {'x': x_nd} + check_symbolic_forward(test, location, [out_np]) + # consider disable the numeric grad check for gpu block kernel since the input is large + if check_numeric_grad: + check_numeric_gradient(test, location) + grad_stypes = {'x': to_stype} + check_symbolic_backward(test, location, [out_np], [out_np], grad_stypes=grad_stypes) + + density = [1.00, 0.50, 0.10, 0.05, 0.01] + for d in density: + shape_2d = rand_shape_2d() + shape_3d = rand_shape_3d() + 
check_cast_storage(shape_2d, d, 'csr', 'default') + check_cast_storage(shape_2d, d, 'default', 'csr') + check_cast_storage(shape_2d, d, 'row_sparse', 'default') + check_cast_storage(shape_2d, d, 'default', 'row_sparse') + check_cast_storage(shape_3d, d, 'row_sparse', 'default') + check_cast_storage(shape_3d, d, 'default', 'row_sparse') + for i in range(4, 6): + shape = rand_shape_nd(i, 5) + check_cast_storage(shape, d, 'default', 'row_sparse') + check_cast_storage(shape, d, 'row_sparse', 'default') + # Test specific gpu kernels + if default_context().device_type is 'gpu': + dim0 = rnd.randint(1, 10) + # test gpu thread kernel + check_cast_storage((dim0, rnd.randint( 1, 32)), d, 'default', 'csr') + # test gpu warp kernel + check_cast_storage((dim0, rnd.randint( 32, 512)), d, 'default', 'csr') + # test gpu block kernel + check_cast_storage((dim0, rnd.randint(512, 1024)), d, 'default', 'csr', + check_numeric_grad=False) + # test gpu thread kernel + check_cast_storage((dim0, rnd.randint( 1, 32)), d, 'default', 'row_sparse') + # test gpu warp kernel + check_cast_storage((dim0, rnd.randint( 32, 512)), d, 'default', 'row_sparse') + # test gpu block kernel + check_cast_storage((dim0, rnd.randint(512, 1024)), d, 'default', 'row_sparse', + check_numeric_grad=False) + +def test_sparse_dot(): + def test_dot_csr(lhs_shape, rhs_shape, rhs_stype, trans_lhs, lhs_density, rhs_density): + lhs_nd = rand_ndarray(lhs_shape, 'csr', density=lhs_density) + lhs_dns = lhs_nd.tostype('default') + rhs_nd = rand_ndarray(rhs_shape, rhs_stype, density=rhs_density) + rhs_dns = rhs_nd if rhs_stype == 'default' else rhs_nd.tostype('default') + + out = mx.nd.dot(lhs_nd, rhs_nd, transpose_a=trans_lhs) + out_dns = mx.nd.dot(lhs_dns, rhs_dns, transpose_a=trans_lhs) + out_np = out_dns.asnumpy() + assert_almost_equal(out.asnumpy(), out_np, rtol=1e-4, atol=1e-5) + + # test symbolic forward + lhs = mx.symbol.Variable('lhs', stype='csr') + rhs = mx.symbol.Variable('rhs', stype=rhs_stype) + out = 
mx.symbol.sparse.dot(lhs, rhs, transpose_a=trans_lhs) + location = {'lhs': lhs_nd, 'rhs': rhs_nd} + check_symbolic_forward(out, location, [out_np], rtol=1e-3, atol=1e-4) + + # test symbolic backward + backward_trans = not trans_lhs + rhs_backward_grad = mx.nd.dot(lhs_dns, out_dns, transpose_a=backward_trans).asnumpy() + expected = {'rhs': rhs_backward_grad} + check_symbolic_backward(out, location, [out_np], expected, + grad_req={'lhs': 'null', 'rhs': 'write'}, + rtol=1e-3, atol=1e-4) + + density = [1.00, 0.50, 0.10, 0.05, 0.01] + for lhs_d in density: + lhs_shape = rand_shape_2d(50, 200) + rhs_d = 1 + test_dot_csr(lhs_shape, (lhs_shape[1], 1), 'default', False, lhs_d, rhs_d) # test gpu SpMV + test_dot_csr(lhs_shape, (lhs_shape[0], 1), 'default', True , lhs_d, rhs_d) # (vector kernel) + test_dot_csr(lhs_shape, (lhs_shape[1], rnd.randint(5, 10)), 'default', False, lhs_d, rhs_d) # test gpu SpMM + test_dot_csr(lhs_shape, (lhs_shape[0], rnd.randint(5, 10)), 'default', True , lhs_d, rhs_d) # (scalar kernel) + for rhs_d in density: + test_dot_csr(lhs_shape, (lhs_shape[1], rnd.randint(1, 10)), 'row_sparse', False, lhs_d, rhs_d) + test_dot_csr(lhs_shape, (lhs_shape[0], rnd.randint(1, 10)), 'row_sparse', True, lhs_d, rhs_d) + + +def test_sparse_slice(): + def check_csr_slice(shape, slice_input): + storage_type = 'csr' + B, _ = rand_sparse_ndarray(shape, storage_type) + np = B.asnumpy() + begin = rnd.randint(0, B.shape[0] - 1) + end = rnd.randint(begin + 1, B.shape[0]) + nd_slice = mx.nd.crop(B, begin=begin, end=end) + assert same(nd_slice.asnumpy(), np[begin:end]), (nd_slice.asnumpy(), np[begin:end]) + + shape = (rnd.randint(7, 15), rnd.randint(1, 10)) + check_csr_slice(shape, True) + check_csr_slice(shape, False) + + +def test_sparse_retain(): + def check_sparse_retain(shape, density, index_type=np.int64): + num_rows = shape[0] + rsp, _ = rand_sparse_ndarray(shape=shape, stype='row_sparse', density=density) + length = np.random.randint(1, num_rows + 1) + idx = 
random_sample(list(range(0, num_rows)), length) + idx.sort() + dns = rsp.asnumpy() + tensor_retained_expected = np.zeros(shape) + for i in idx: + tensor_retained_expected[i][:] = dns[i] + indices = mx.nd.array(idx, dtype=index_type) + rsp_retained = mx.nd.sparse.retain(rsp, indices=indices) + assert same(tensor_retained_expected, rsp_retained.asnumpy()) + + # check numeric gradient + data = mx.symbol.Variable('data') + idx = mx.symbol.Variable('indices') + sym = mx.sym.sparse.retain(data=data, indices=idx) + check_numeric_gradient(sym, [rsp, indices], grad_nodes=['data'], + grad_stype_dict={'data': 'row_sparse'}) + + shape = rand_shape_2d() + shape_3d = rand_shape_3d() + densities = [0.01, 0.1, 0.2, 0.5, 0.8, 1.0] + index_types = [np.float32, np.int32, np.int64] + for density in densities: + for itype in index_types: + check_sparse_retain(shape, density, itype) + check_sparse_retain(shape_3d, density, itype) + + +def test_sparse_nd_zeros(): + def check_sparse_nd_zeros(stype, shape): + zero = mx.nd.zeros(shape) + sparse_zero = mx.nd.zeros(shape=shape, stype=stype) + assert_almost_equal(sparse_zero.asnumpy(), zero.asnumpy()) + + shape = rand_shape_2d() + check_sparse_nd_zeros('row_sparse', shape) + check_sparse_nd_zeros('csr', shape) + check_sparse_nd_zeros('default', shape) + + +def test_sparse_square_sum(): + dim0 = 30 + dim1 = 30 + axes = [0, 1] + keepdims = [False, True] + densities = [0, 0.01, 0.1, 0.2, 0.5] + for density in densities: + shape = rand_shape_2d(dim0, dim1) + rsp = rand_ndarray(shape, 'row_sparse', density) + dns = rsp.tostype('default') + for axis in axes: + for keepdim in keepdims: + ret = mx.nd._internal._square_sum(rsp, axis=axis, keepdims=keepdim) + if axis == 1 and keepdim: + assert ret.stype == 'row_sparse' + else: + assert ret.stype == 'default' + ret_expected = mx.nd.sum(dns*dns, axis=axis, keepdims=keepdim) + # check forward result + assert same(ret.asnumpy(), ret_expected.asnumpy()) + + rsp_data = mx.sym.Variable('data', 
stype='row_sparse') + test = mx.symbol._internal._square_sum(rsp_data, axis=axis, keepdims=keepdim) + + # check symbolic backward since ograd can be a rsp + # and cannot be checked through check_numeric_gradient + # because it will add a loss layer as the output layer + # which makes ograd of the square_sum dense + if axis == 1 and keepdims: + dns_data = mx.sym.Variable('data') + baseline = mx.sym.sum(mx.sym.square(dns_data), axis=axis, keepdims=keepdim) + igrad_expected = mx.nd.empty(dns.shape) + baseline_exec = baseline.bind(default_context(), args=[dns], + args_grad=[igrad_expected]) + baseline_exec.forward(is_train=True) + baseline_exec.backward([ret_expected]) + check_symbolic_backward(test, [rsp], [ret], [igrad_expected.asnumpy()], + grad_stypes={'data': 'row_sparse'}) + + # check numeric gradient + check_numeric_gradient(test, [rsp], grad_stype_dict={'data': 'row_sparse'}, + atol=1e-2, rtol=0.1) + +def test_sparse_storage_fallback(): + """ test operators which don't implement FComputeEx or FStatefulComputeEx """ + def check_broadcast_add(shape, lhs_stype, rhs_stype): + lhs = mx.symbol.Variable('lhs', stype=lhs_stype) + rhs = mx.symbol.Variable('rhs', stype=rhs_stype) + lhs_nd = rand_ndarray(shape, lhs_stype) + rhs_nd = rand_ndarray(shape, rhs_stype) + lhs_dns = mx.nd.cast_storage(lhs_nd, stype='default') + rhs_dns = mx.nd.cast_storage(rhs_nd, stype='default') + + out_dns = (lhs_dns + rhs_dns).asnumpy() + test = mx.symbol.broadcast_add(lhs, rhs) + location = {'lhs': lhs_nd, 'rhs': rhs_nd} + check_symbolic_forward(test, location, [out_dns]) + check_numeric_gradient(test, location) + check_symbolic_backward(test, location, [out_dns], [out_dns, out_dns]) + + def np_softmax(x, axis=-1): + # fix for old numpy on Travis not supporting keepdims + # x = x - np.max(x, axis=-1, keepdims=True) + x = x - np.max(x, axis=axis, keepdims=True) + x = np.exp(x) + # x /= np.sum(x, axis=-1, keepdims=True) + x /= np.sum(x, axis=axis, keepdims=True) + return x + + def 
check_softmax_with_shape(lhs_stype, rhs_stype, shape, preserve_shape=False): + # bind with label + ctx = default_context() + X = mx.symbol.Variable('X', stype=lhs_stype) + L = mx.symbol.Variable('L', stype=rhs_stype) + Y = mx.symbol.SoftmaxOutput(data=X, label=L, preserve_shape=preserve_shape) + x = rand_ndarray(shape, lhs_stype) + l = rand_ndarray(shape, rhs_stype) + l[:] = np_softmax(l.asnumpy()) + grad = mx.nd.empty(shape, ctx=ctx) + exec1 = Y.bind(ctx, args = [x, l], args_grad = {'X': grad}) + exec1.forward(is_train=True) + out = exec1.outputs[0].asnumpy() + assert_almost_equal(out, np_softmax(x.asnumpy()), rtol=1e-4) + exec1.backward() + assert_almost_equal(grad.asnumpy(), np_softmax(x.asnumpy()) - l.asnumpy(), + rtol=1e-3, atol=1e-4) + + def check_concat(shape, lhs_stype, rhs_stype): + x = mx.symbol.Variable('x', stype=lhs_stype) + w = mx.symbol.Variable('w', stype=rhs_stype) + test = mx.sym.Concat(x, w) + x_nd = rand_ndarray(shape, lhs_stype) + w_nd = rand_ndarray(shape, rhs_stype) + location = {'x': x_nd, 'w': w_nd} + check_numeric_gradient(test, location) + + shape = rand_shape_2d() + stypes = ['default', 'csr', 'row_sparse'] + for lhs in stypes: + for rhs in stypes: + check_broadcast_add(shape, lhs, rhs) + check_concat(shape, lhs, rhs) + check_softmax_with_shape(lhs, rhs, shape, preserve_shape=False) + check_softmax_with_shape(rhs, rhs, shape, preserve_shape=True) + + +def test_sparse_elementwise_sum(): + def check_sparse_elementwise_sum_with_shape(stype, shape, n): + # forward + inputs = [mx.symbol.Variable('arg%d' % i) for i in range(n)] + out = mx.symbol.sparse.add_n(*inputs, name='esum') + arr = [] + arr_grad = [mx.nd.empty(shape) for _ in range(n)] + densities = [0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5] + for i in range(n): + arr.append(rand_ndarray(shape, stype, np.random.randint(0, len(densities)))) + + exec1 = out.bind(default_context(), + args=arr, + args_grad=arr_grad) + exec1.forward(is_train=True) + out1 = exec1.outputs[0].asnumpy() + out = 
sum(a.asnumpy() for a in arr) + assert_almost_equal(out, out1) + + out_grad = mx.nd.empty(shape) + out_grad[:] = np.random.uniform(-10, 10, shape) + # backward + exec1.backward([out_grad]) + for a in arr_grad: + assert_almost_equal(a.asnumpy(), out_grad.asnumpy()) + + maxdim = 5 + for dim in range(2, maxdim): + shape = tuple(np.random.randint(5, 10, size=dim)) + check_sparse_elementwise_sum_with_shape('row_sparse', shape, np.random.randint(1, 9)) + + +if __name__ == '__main__': + import nose + nose.runmodule() diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh index fb1869f842b1..fd23f0e82b24 100755 --- a/tests/travis/run_test.sh +++ b/tests/travis/run_test.sh @@ -117,21 +117,21 @@ if [ ${TASK} == "python_test" ]; then mkdir -p ${PWD}/data if [ ${TRAVIS_OS_NAME} == "osx" ]; then - python -m nose tests/python/unittest || exit -1 - python3 -m nose tests/python/unittest || exit -1 + python -m nose -v tests/python/unittest || exit -1 + python3 -m nose -v tests/python/unittest || exit -1 # make cython3 # cython tests # export MXNET_ENFORCE_CYTHON=1 # python3 -m nose tests/python/unittest || exit -1 - python3 -m nose tests/python/train || exit -1 - python -m nose tests/python/doctest || exit -1 - python3 -m nose tests/python/doctest || exit -1 + python3 -m nose -v tests/python/train || exit -1 + python -m nose -v tests/python/doctest || exit -1 + python3 -m nose -v tests/python/doctest || exit -1 else - nosetests tests/python/unittest || exit -1 - nosetests3 tests/python/unittest || exit -1 - nosetests3 tests/python/train || exit -1 - nosetests tests/python/doctest || exit -1 - nosetests3 tests/python/doctest || exit -1 + nosetests -v tests/python/unittest || exit -1 + nosetests3 -v tests/python/unittest || exit -1 + nosetests3 -v tests/python/train || exit -1 + nosetests -v tests/python/doctest || exit -1 + nosetests3 -v tests/python/doctest || exit -1 fi exit 0 fi diff --git a/tests/travis/setup.sh b/tests/travis/setup.sh index 94d674f3943e..f479306a31a8 
100755 --- a/tests/travis/setup.sh +++ b/tests/travis/setup.sh @@ -33,8 +33,8 @@ if [ ${TRAVIS_OS_NAME} == "osx" ]; then brew install ImageMagick brew install swig if [ ${TASK} == "python_test" ]; then - python -m pip install --user nose numpy cython - python3 -m pip install --user nose numpy cython + python -m pip install --user nose numpy cython scipy + python3 -m pip install --user nose numpy cython scipy fi fi From 54b92401d7677040bdccb46e2c5a4a95a4c6130d Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Tue, 22 Aug 2017 17:03:07 -0700 Subject: [PATCH 017/448] add flatten option to fc (#7548) * add last_axis option to fc * update per comments * clean up --- python/mxnet/gluon/nn/basic_layers.py | 30 +++++++++----- src/operator/fully_connected-inl.h | 57 +++++++++++++++++++++------ src/operator/fully_connected.cc | 13 ++++-- tests/python/unittest/test_gluon.py | 24 +++++++++-- 4 files changed, 97 insertions(+), 27 deletions(-) diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index 7901a7ae2350..2c9ff49db1e9 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -102,7 +102,7 @@ def __len__(self): class Dense(HybridBlock): - """Just your regular densely-connected NN layer. + r"""Just your regular densely-connected NN layer. `Dense` implements the operation: `output = activation(dot(input, weight) + bias)` @@ -124,6 +124,11 @@ class Dense(HybridBlock): (ie. "linear" activation: `a(x) = x`). use_bias : bool Whether the layer uses a bias vector. + flatten: bool + Whether the input tensor should be flattened. + If true, all but the first axis of input data are collapsed together. + If false, all but the last axis of input data are kept the same, and the transformation + applies on the last axis. weight_initializer : str or `Initializer` Initializer for the `kernel` weights matrix. 
bias_initializer: str or `Initializer` @@ -138,16 +143,27 @@ class Dense(HybridBlock): See document of `Block`. + If ``flatten`` is set to be True, then the shapes are: Input shape: - A 2D input with shape `(batch_size, in_units)`. + An N-D input with shape + `(batch_size, x1, x2, ..., xn) with x1 * x2 * ... * xn equal to in_units`. Output shape: The output would have shape `(batch_size, units)`. + + If ``flatten`` is set to be false, then the shapes are: + Input shape: + An N-D input with shape + `(x1, x2, ..., xn, in_units)`. + + Output shape: + The output would have shape `(x1, x2, ..., xn, units)`. """ - def __init__(self, units, activation=None, use_bias=True, + def __init__(self, units, activation=None, use_bias=True, flatten=True, weight_initializer=None, bias_initializer='zeros', in_units=0, **kwargs): super(Dense, self).__init__(**kwargs) + self._flatten = flatten with self.name_scope(): self._units = units self._in_units = in_units @@ -166,12 +182,8 @@ def __init__(self, units, activation=None, use_bias=True, self.act = None def hybrid_forward(self, F, x, weight, bias=None): - if bias is None: - act = F.FullyConnected(x, weight, no_bias=True, num_hidden=self._units, - name='fwd') - else: - act = F.FullyConnected(x, weight, bias, num_hidden=self._units, - name='fwd') + act = F.FullyConnected(x, weight, bias, no_bias=bias is None, num_hidden=self._units, + flatten=self._flatten, name='fwd') if self.act is not None: act = self.act(act) return act diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h index cf13655d9c97..6f0cf544d633 100644 --- a/src/operator/fully_connected-inl.h +++ b/src/operator/fully_connected-inl.h @@ -48,12 +48,15 @@ enum FullyConnectedOpOutputs {kOut}; struct FullyConnectedParam : public dmlc::Parameter { int num_hidden; bool no_bias; + bool flatten; DMLC_DECLARE_PARAMETER(FullyConnectedParam) { // TODO(bing) add support for boolean DMLC_DECLARE_FIELD(num_hidden).set_lower_bound(1) .describe("Number of 
hidden nodes of the output."); DMLC_DECLARE_FIELD(no_bias).set_default(false) .describe("Whether to disable bias parameter."); + DMLC_DECLARE_FIELD(flatten).set_default(true) + .describe("Whether to collapse all but the first axis of the input data tensor."); } }; @@ -91,11 +94,20 @@ class FullyConnectedOp : public Operator { const TShape& ishape = in_data[fullc::kData].shape_; const TShape& oshape = out_data[fullc::kOut].shape_; - Tensor data = in_data[fullc::kData].get_with_shape( - Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); Tensor wmat = in_data[fullc::kWeight].get(s); - Tensor out = out_data[fullc::kOut].get_with_shape( - Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); + Tensor data, out; + if (!param_.flatten) { + data = in_data[fullc::kData].get_with_shape( + Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); + out = out_data[fullc::kOut].get_with_shape( + Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); + } else { + data = in_data[fullc::kData].get_with_shape( + Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); + out = out_data[fullc::kOut].get_with_shape( + Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); + } + // Legacy approach shown here for comparison: // out = dot(data, wmat.T()); linalg_gemm(data, wmat, out, false, true, s); @@ -124,11 +136,23 @@ class FullyConnectedOp : public Operator { const TShape& ishape = in_data[fullc::kData].shape_; const TShape& oshape = out_grad[fullc::kOut].shape_; - Tensor data = in_data[fullc::kData].get_with_shape( - Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); Tensor wmat = in_data[fullc::kWeight].get(s); - Tensor grad = out_grad[fullc::kOut].get_with_shape( - Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); + Tensor data, grad, gdata; + if (!param_.flatten) { + data = in_data[fullc::kData].get_with_shape( + Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); + grad = 
out_grad[fullc::kOut].get_with_shape( + Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); + gdata = in_grad[fullc::kData].get_with_shape( + Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); + } else { + data = in_data[fullc::kData].get_with_shape( + Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); + grad = out_grad[fullc::kOut].get_with_shape( + Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); + gdata = in_grad[fullc::kData].get_with_shape( + Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); + } #if defined(__CUDACC__) CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) @@ -147,8 +171,6 @@ class FullyConnectedOp : public Operator { Assign(gbias, req[fullc::kBias], sum_rows(grad)); } // gradient of data - Tensor gdata = in_grad[fullc::kData].get_with_shape( - Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); // Legacy approach shown here for comparison: // Assign(gdata, req[fullc::kData], dot(grad, wmat)); linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]); @@ -199,13 +221,24 @@ class FullyConnectedProp : public OperatorProperty { // require data to be known if (dshape.ndim() == 0) return false; - index_t num_input = dshape.ProdShape(1, dshape.ndim()); + index_t num_input; + if (!param_.flatten) { + num_input = dshape[dshape.ndim()-1]; + } else { + num_input = dshape.ProdShape(1, dshape.ndim()); + } SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param_.num_hidden, num_input)); if (!param_.no_bias) { SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param_.num_hidden)); } - SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param_.num_hidden)); + if (!param_.flatten) { + TShape result_shape(dshape); + result_shape[dshape.ndim()-1] = param_.num_hidden; + SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape); + } else { + SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param_.num_hidden)); + } if (oshape.ndim() != 0) { dshape[0] = oshape[0]; 
SHAPE_ASSIGN_CHECK(*in_shape, fullc::kData, dshape); diff --git a/src/operator/fully_connected.cc b/src/operator/fully_connected.cc index 5dbaf8c82005..82c32a7d2546 100644 --- a/src/operator/fully_connected.cc +++ b/src/operator/fully_connected.cc @@ -76,13 +76,20 @@ DMLC_REGISTER_PARAMETER(FullyConnectedParam); MXNET_REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedProp) .describe(R"code(Applies a linear transformation: :math:`Y = XW^T + b`. -Shapes: +If ``flatten`` is set to be true, then the shapes are: -- **data**: `(batch_size, input_dim)` -- **weight**: `(num_hidden, input_dim)` +- **data**: `(batch_size, x1, x2, ..., xn)` +- **weight**: `(num_hidden, x1 * x2 * ... * xn)` - **bias**: `(num_hidden,)` - **out**: `(batch_size, num_hidden)` +If ``flatten`` is set to be false, then the shapes are: + +- **data**: `(x1, x2, ..., xn, input_dim)` +- **weight**: `(num_hidden, input_dim)` +- **bias**: `(num_hidden,)` +- **out**: `(x1, x2, ..., xn, num_hidden)` + The learnable parameters include both ``weight`` and ``bias``. If ``no_bias`` is set to be true, then the ``bias`` term is ignored. 
diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index cafa08bc04ca..726213dd5455 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -67,9 +67,9 @@ def forward(self, x): def test_basic(): model = nn.Sequential() - model.add(nn.Dense(128, activation='tanh', in_units=10)) + model.add(nn.Dense(128, activation='tanh', in_units=10, flatten=False)) model.add(nn.Dropout(0.5)) - model.add(nn.Dense(64, activation='tanh', in_units=128)) + model.add(nn.Dense(64, activation='tanh', in_units=256)) model.add(nn.Dense(32, in_units=64)) model.add(nn.Activation('relu')) @@ -80,7 +80,7 @@ def test_basic(): # ndarray model.collect_params().initialize(mx.init.Xavier(magnitude=2.24)) - x = model(mx.nd.zeros((32, 10))) + x = model(mx.nd.zeros((32, 2, 10))) assert x.shape == (32, 32) x.wait_to_read() @@ -90,6 +90,24 @@ def test_basic(): assert list(model.collect_params().values())[0]._grad is not None +def test_dense(): + model = nn.Dense(128, activation='tanh', in_units=10, flatten=False, prefix='test_') + inputs = mx.sym.Variable('data') + outputs = model(inputs) + assert set(model.collect_params().keys()) == set(['test_weight', 'test_bias']) + assert outputs.list_outputs() == ['test_tanh_fwd_output'] + args, outs, auxs = outputs.infer_shape(data=(2, 3, 10)) + assert outs == [(2, 3, 128)] + + model = nn.Dense(128, activation='relu', in_units=30, flatten=True, prefix='test2_') + inputs = mx.sym.Variable('data') + outputs = model(inputs) + assert set(model.collect_params().keys()) == set(['test2_weight', 'test2_bias']) + assert outputs.list_outputs() == ['test2_relu_fwd_output'] + args, outs, auxs = outputs.infer_shape(data=(17, 2, 5, 3)) + assert outs == [(17, 128)] + + def test_symbol_block(): model = nn.HybridSequential() model.add(nn.Dense(128, activation='tanh')) From 68cd9c924a20ce94e31695cfd431b8ddf99d560b Mon Sep 17 00:00:00 2001 From: mbaijal <30911248+mbaijal@users.noreply.github.com> Date: Tue, 22 
Aug 2017 17:04:17 -0700 Subject: [PATCH 018/448] Updating the LICENSE and NOTICE Files (#7563) --- LICENSE | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ NOTICE | 40 +++++++++++++-- 2 files changed, 193 insertions(+), 4 deletions(-) diff --git a/LICENSE b/LICENSE index d64569567334..01dfcf46792d 100644 --- a/LICENSE +++ b/LICENSE @@ -200,3 +200,160 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + + ======================================================================= + Apache MXNET (incubating) Subcomponents: + + The Apache MXNET (incubating) project contains subcomponents with separate copyright + notices and license terms. Your use of the source code for the these + subcomponents is subject to the terms and conditions of the following + licenses. + + ======================================================================== + Apache-2.0 licenses + ======================================================================== + + The following components are provided under an Apache 2.0 license. + + 1. MXNet Cpp-package - For details, /cpp-package/LICENSE + 2. MXNet rcnn - For details, see, example/rcnn/LICENSE + 3. scala-package - For details, see, scala-package/LICENSE + 4. Warp-CTC - For details, see, src/operator/contrib/ctc_include/LICENSE + 5. dlpack - For details, see, dlpack/LICENSE + 6. dmlc-core - For details, see, dmlc-core/LICENSE + 7. mshadow - For details, see, mshadow/LICENSE + 8. nnvm/dmlc-core - For details, see, nnvm/dmlc-core/LICENSE + 9. nnvm - For details, see, nnvm/LICENSE + 10. nnvm-fusion - For details, see, nnvm/plugin/nnvm-fusion/LICENSE + 11. ps-lite - For details, see, ps-lite/LICENSE + + ======================================================================== + MIT licenses + ======================================================================== + + 1. 
Fast R-CNN - For details, see example/rcnn/LICENSE + 2. Faster R-CNN - For details, see example/rcnn/LICENSE + 3. tree_lstm - For details, see example/gluon/tree_lstm/LICENSE + + + ======================================================================== + NVIDIA Licenses + ======================================================================== + + 1. Warp-CTC + For details, see, src/operator/contrib/ctc_include/contrib/moderngpu/LICENSE + + /****************************************************************************** + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + + 2. CUB Library + For details, see, cub/LICENSE.TXT + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + ======================================================================== + Other Licenses + ======================================================================== + + 1. Caffe + For details, see, example/rcnn/LICENSE + + LICENSE + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + CONTRIBUTION AGREEMENT + + By contributing to the BVLC/caffe repository through pull-request, comment, + or otherwise, the contributor releases their content to the + license and copyright terms herein. + + + 2. MS COCO API + For details, see, example/rcnn/LICENSE + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + The views and conclusions contained in the software and documentation are those + of the authors and should not be interpreted as representing official policies, + either expressed or implied, of the FreeBSD Project. + diff --git a/NOTICE b/NOTICE index 03695607e3e9..2341ea27f67a 100644 --- a/NOTICE +++ b/NOTICE @@ -1,5 +1,37 @@ -Apache MXNET (incubating) -Copyright [2015-2017] The Apache Software Foundation + Apache MXNET (incubating) + Copyright 2015-2017 The Apache Software Foundation + + This product includes software developed at + The Apache Software Foundation (http://www.apache.org/). + + Warp-CTC + Copyright (c) 2013, NVIDIA CORPORATION. + + CUB Library + Copyright (c) 2010-2011, Duane Merrill. All rights reserved. + Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. + + Caffe + COPYRIGHT + All contributions by the University of California: + Copyright (c) 2014, 2015, The Regents of the University of California (Regents) + All rights reserved. + All other contributions: + Copyright (c) 2014, 2015, the respective contributors + All rights reserved. + Caffe uses a shared copyright model: each contributor holds copyright over + their contributions to Caffe. The project versioning records all such + contribution and copyright details. 
If a contributor wants to further mark + their specific copyright on a particular contribution, they should indicate + their copyright solely in the commit message of the change when it is + committed. + + + MS COCO API + Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin + + + + + -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). From 491f81e648639c53a68155585c53c3993a33ead5 Mon Sep 17 00:00:00 2001 From: "Joshua Z. Zhang" Date: Tue, 22 Aug 2017 19:57:32 -0700 Subject: [PATCH 019/448] add resnet50_v2 pretrained (#7564) --- python/mxnet/gluon/model_zoo/model_store.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/mxnet/gluon/model_zoo/model_store.py b/python/mxnet/gluon/model_zoo/model_store.py index e524f215416d..6bc4b2805afd 100644 --- a/python/mxnet/gluon/model_zoo/model_store.py +++ b/python/mxnet/gluon/model_zoo/model_store.py @@ -38,6 +38,7 @@ ('2a903ab21260c85673a78fe65037819a843a1f43', 'resnet50_v1'), ('8aacf80ff4014c1efa2362a963ac5ec82cf92d5b', 'resnet18_v2'), ('0ed3cd06da41932c03dea1de7bc2506ef3fb97b3', 'resnet34_v2'), + ('eb7a368774aa34a12ed155126b641ae7556dad9d', 'resnet50_v2'), ('264ba4970a0cc87a4f15c96e25246a1307caf523', 'squeezenet1.0'), ('33ba0f93753c83d86e1eb397f38a667eaf2e9376', 'squeezenet1.1'), ('dd221b160977f36a53f464cb54648d227c707a05', 'vgg11'), From 393293115701b27694b8f9105f8c6360ebbbc557 Mon Sep 17 00:00:00 2001 From: Stefan Henneking Date: Wed, 23 Aug 2017 12:10:50 -0700 Subject: [PATCH 020/448] fixed minor typo (#7581) --- src/operator/tensor/dot-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/tensor/dot-inl.h b/src/operator/tensor/dot-inl.h index aaf242e26fe1..ecaf5f9a0e95 100644 --- a/src/operator/tensor/dot-inl.h +++ b/src/operator/tensor/dot-inl.h @@ -71,7 +71,7 @@ void DotForward_(const nnvm::NodeAttrs& attrs, << "dot only supports float32 and float64"; MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, DType, { if (inputs[0].ndim() == 
1 && inputs[1].ndim() == 1) { - CHECK_NE(req[0], kAddTo) << "AddTo not yet suported"; + CHECK_NE(req[0], kAddTo) << "AddTo not yet supported"; Tensor out = outputs[0].get(s); VectorDot(out, inputs[0].get(s), From 6d9b6a3fd8a251dad2ce846a0ce4ade037ce7bbb Mon Sep 17 00:00:00 2001 From: qingzhouzhen <576591769@qq.com> Date: Thu, 24 Aug 2017 04:33:57 +0800 Subject: [PATCH 021/448] modify parameters counting of FC and CONV (#7568) --- python/mxnet/visualization.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/python/mxnet/visualization.py b/python/mxnet/visualization.py index 4dbf680c2e3a..aa00488d96a7 100644 --- a/python/mxnet/visualization.py +++ b/python/mxnet/visualization.py @@ -134,12 +134,20 @@ def print_layer_summary(node, out_shape): pre_filter = pre_filter + int(shape[0]) cur_param = 0 if op == 'Convolution': - cur_param = pre_filter * int(node["attr"]["num_filter"]) - for k in _str2tuple(node["attr"]["kernel"]): - cur_param *= int(k) - cur_param += int(node["attr"]["num_filter"]) + if ("no_bias" in node["attr"]) and (node["attr"]["no_bias"] == 'True'): + cur_param = pre_filter * int(node["attr"]["num_filter"]) + for k in _str2tuple(node["attr"]["kernel"]): + cur_param *= int(k) + else: + cur_param = pre_filter * int(node["attr"]["num_filter"]) + for k in _str2tuple(node["attr"]["kernel"]): + cur_param *= int(k) + cur_param += int(node["attr"]["num_filter"]) elif op == 'FullyConnected': - cur_param = pre_filter * (int(node["attr"]["num_hidden"]) + 1) + if ("no_bias" in node["attr"]) and (node["attr"]["no_bias"] == 'True'): + cur_param = pre_filter * (int(node["attr"]["num_hidden"])) + else: + cur_param = (pre_filter+1) * (int(node["attr"]["num_hidden"])) elif op == 'BatchNorm': key = node["name"] + "_output" if show_shape: From f68cb40df7fb0fe5fade47b65eefac07dd35a9b5 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 23 Aug 2017 14:15:55 -0700 Subject: [PATCH 022/448] FP16-I/O conv/deconv to use pseudo-fp16, ignoring 
MSHADOW_USE_PASCAL. (#7527) * FP16-I/O conv and deconv will use pseudo-fp16, ignoring MSHADOW_USE_PASCAL. * Fixing cpplint error. * Empty commit to trigger CI. --- src/operator/convolution.cu | 55 ++++--------------------------- src/operator/deconvolution.cu | 61 +++++------------------------------ 2 files changed, 14 insertions(+), 102 deletions(-) diff --git a/src/operator/convolution.cu b/src/operator/convolution.cu index f5777c1714a4..b327f3cff424 100644 --- a/src/operator/convolution.cu +++ b/src/operator/convolution.cu @@ -60,61 +60,18 @@ Operator* CreateOp(ConvolutionParam param, int dtype, } #if MXNET_USE_CUDNN == 1 - // The NVIDIA Pascal architecture was the first to include 16-bit ALUs. - // Thus, when the framework is compiled with MSHADOW_USE_PASCAL == 1, we - // perform the convolution calculation in 16-bit when the tensor type is - // also 16-bit. For NVIDIA architectures earlier than Pascal (so Maxwell - // and Kepler), the computation precision is always at least 32-bits. -#if MSHADOW_USE_PASCAL == 1 - // true fp16 - int desired_forward_compute_type = dtype; - int desired_backward_compute_type = dtype; -#else - // pseudo fp16 - int desired_forward_compute_type = - (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype; - int desired_backward_compute_type = - (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype; -#endif // MSHADOW_USE_PASCAL == 1 + // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16). + int compute_type = (dtype == mshadow::kFloat16) ? 
mshadow::kFloat32 : dtype; MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { if (param.cudnn_off) { op = new ConvolutionOp(param); + } else if (!CuDNNConvolutionOp::Supports(param, compute_type, compute_type, ctx)) { + LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied."; + op = new ConvolutionOp(param); } else { - int forward_compute_type = desired_forward_compute_type; - int backward_compute_type = desired_backward_compute_type; - bool convolutionIsSupported = CuDNNConvolutionOp::Supports(param, - forward_compute_type, - backward_compute_type, ctx); - - // If cuDNN can't handle this case with fp16 backprop kernels, try fp32 backprop. - if (!convolutionIsSupported && backward_compute_type == mshadow::kFloat16) { - backward_compute_type = mshadow::kFloat32; - convolutionIsSupported = CuDNNConvolutionOp::Supports(param, - forward_compute_type, - backward_compute_type, ctx); - } - - // If cuDNN can't handle this case with fp16 forward kernels, try fp32 - if (!convolutionIsSupported && forward_compute_type == mshadow::kFloat16) { - forward_compute_type = mshadow::kFloat32; - convolutionIsSupported = CuDNNConvolutionOp::Supports(param, - forward_compute_type, - backward_compute_type, ctx); - } - if (!convolutionIsSupported) { - LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied."; - op = new ConvolutionOp(param); - } else { - if (forward_compute_type != desired_forward_compute_type) - LOG(WARNING) << "Requested forward compute precision not supported, using fp32."; - if (backward_compute_type != desired_backward_compute_type) - LOG(WARNING) << "Requested backward compute precision not supported, using fp32."; - op = new CuDNNConvolutionOp(param, - forward_compute_type, - backward_compute_type, + op = new CuDNNConvolutionOp(param, compute_type, compute_type, *in_shape, *out_shape, ctx); - } } }) #else diff --git a/src/operator/deconvolution.cu b/src/operator/deconvolution.cu index 
e9b5cb8e3c7f..de7dff5569ed 100644 --- a/src/operator/deconvolution.cu +++ b/src/operator/deconvolution.cu @@ -45,64 +45,19 @@ Operator* CreateOp(DeconvolutionParam param, int dtype, return op; } #if MXNET_USE_CUDNN == 1 - // The NVIDIA Pascal architecture was the first to include 16-bit ALUs. - // Thus, when the framework is compiled with MSHADOW_USE_PASCAL == 1, we - // perform the deconvolution calculation in 16-bit when the tensor type is - // also 16-bit. For NVIDIA architectures earlier than Pascal (so Maxwell - // and Kepler), the computation precision is always at least 32-bits. -#if MSHADOW_USE_PASCAL == 1 - // true fp16 - int desired_forward_compute_type = dtype; - int desired_backward_compute_type = dtype; -#else - // pseudo fp16 - int desired_forward_compute_type = - (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype; - int desired_backward_compute_type = - (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype; -#endif // MSHADOW_USE_PASCAL == 1 + // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16). + int compute_type = (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype; MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { if (param.cudnn_off) { op = new DeconvolutionOp(param); + } else if (!CuDNNDeconvolutionOp::Supports(param, compute_type, compute_type, ctx)) { + LOG(WARNING) << + "This deconvolution is not supported by cudnn, MXNET deconvolution is applied."; + op = new DeconvolutionOp(param); } else { - int forward_compute_type = desired_forward_compute_type; - int backward_compute_type = desired_backward_compute_type; - bool deconvolutionIsSupported = CuDNNDeconvolutionOp::Supports(param, - forward_compute_type, - backward_compute_type, ctx); - - // If cuDNN can't handle this case with fp16 backprop kernels, try fp32 backprop. 
- if (!deconvolutionIsSupported && backward_compute_type == mshadow::kFloat16) { - backward_compute_type = mshadow::kFloat32; - deconvolutionIsSupported = CuDNNDeconvolutionOp::Supports(param, - forward_compute_type, - backward_compute_type, ctx); - } - - // If cuDNN can't handle this case with fp16 forward kernels, try fp32 - if (!deconvolutionIsSupported && forward_compute_type == mshadow::kFloat16) { - forward_compute_type = mshadow::kFloat32; - deconvolutionIsSupported = CuDNNDeconvolutionOp::Supports(param, - forward_compute_type, - backward_compute_type, ctx); - } - if (!deconvolutionIsSupported) { - LOG(WARNING) << - "This deconvolution is not supported by cudnn, MXNET deconvolution is applied."; - op = new DeconvolutionOp(param); - } else { - if ((forward_compute_type != desired_forward_compute_type) || - (backward_compute_type != desired_backward_compute_type)) { - LOG(WARNING) << - "True fp16 deconvolution by cudnn not supported in this configuration. " << - "Falling back to pseudo fp16."; - } - op = new CuDNNDeconvolutionOp(param, - forward_compute_type, - backward_compute_type, - *in_shape, *out_shape, ctx); - } + op = new CuDNNDeconvolutionOp(param, compute_type, compute_type, + *in_shape, *out_shape, ctx); } }) #else From 050d85e338beed7880577b76aaf73c2948fc8a35 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 23 Aug 2017 14:41:17 -0700 Subject: [PATCH 023/448] Set dev_id in streams, also update mshadow. (#7526) * Set dev_id in streams, also update mshadow. * Fix cpplint error. * Empty commit to trigger CI. * Further update of mshadow to match current hash. 
--- src/common/cuda_utils.h | 19 ++++++++++++------- src/engine/naive_engine.cc | 2 +- src/engine/stream_manager.h | 4 ++-- src/engine/threaded_engine_perdevice.cc | 4 ++-- tests/cpp/include/test_op.h | 3 ++- 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/common/cuda_utils.h b/src/common/cuda_utils.h index 0213c73177b3..0f63895d3284 100644 --- a/src/common/cuda_utils.h +++ b/src/common/cuda_utils.h @@ -274,26 +274,31 @@ inline int SMArch(int device_id) { /*! * \brief Determine whether a cuda-capable gpu's architecture supports float16 math. + * Assume not if device_id is negative. * \param device_id The device index of the cuda-capable gpu of interest. * \return whether the gpu's architecture supports float16 math. */ inline bool SupportsFloat16Compute(int device_id) { - // Kepler and most Maxwell GPUs do not support fp16 compute - int computeCapabilityMajor = ComputeCapabilityMajor(device_id); - int computeCapabilityMinor = ComputeCapabilityMinor(device_id); - return (computeCapabilityMajor > 5) || - (computeCapabilityMajor == 5 && computeCapabilityMinor >= 3); + if (device_id < 0) { + return false; + } else { + // Kepler and most Maxwell GPUs do not support fp16 compute + int computeCapabilityMajor = ComputeCapabilityMajor(device_id); + return (computeCapabilityMajor > 5) || + (computeCapabilityMajor == 5 && ComputeCapabilityMinor(device_id) >= 3); + } } /*! * \brief Determine whether a cuda-capable gpu's architecture supports Tensor Core math. + * Assume not if device_id is negative. * \param device_id The device index of the cuda-capable gpu of interest. * \return whether the gpu's architecture supports Tensor Core math. 
*/ inline bool SupportsTensorCore(int device_id) { // Volta (sm_70) supports TensorCore algos - int computeCapabilityMajor = ComputeCapabilityMajor(device_id); - return (computeCapabilityMajor >= 7); + return device_id >= 0 && + ComputeCapabilityMajor(device_id) >=7; } // The policy if the user hasn't set the environment variable MXNET_CUDA_ALLOW_TENSOR_CORE diff --git a/src/engine/naive_engine.cc b/src/engine/naive_engine.cc index 85ec3ae672e2..b354418288aa 100644 --- a/src/engine/naive_engine.cc +++ b/src/engine/naive_engine.cc @@ -154,7 +154,7 @@ class NaiveEngine final : public Engine { streams_.resize(dev_id + 1, nullptr); } if (streams_[dev_id] == nullptr) { - streams_[dev_id] = mshadow::NewStream(true, MXNET_USE_CUDNN != 0); + streams_[dev_id] = mshadow::NewStream(true, MXNET_USE_CUDNN != 0, dev_id); } exec_fun(RunContext{exec_ctx, streams_[dev_id]}, callback); #else diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h index 1a66277bb4ec..cd6db53f14c6 100644 --- a/src/engine/stream_manager.h +++ b/src/engine/stream_manager.h @@ -77,7 +77,7 @@ RunContext StreamManager::GetRunContext( auto&& counter = gpu_cnt_.at(ctx.dev_id); if (counter == -1) { for (auto&& i : gpu_streams_.at(ctx.dev_id)) { - i = mshadow::NewStream(true, MXNET_USE_CUDNN != 0); + i = mshadow::NewStream(true, MXNET_USE_CUDNN != 0, ctx.dev_id); } counter = 0; } @@ -108,7 +108,7 @@ RunContext StreamManager::GetIORunContext( { std::lock_guard lock{m_}; if (gpu_io_streams_.at(ctx.dev_id) == nullptr) { - gpu_io_streams_.at(ctx.dev_id) = mshadow::NewStream(false, false); + gpu_io_streams_.at(ctx.dev_id) = mshadow::NewStream(false, false, ctx.dev_id); } } ret = RunContext{ctx, gpu_io_streams_.at(ctx.dev_id)}; diff --git a/src/engine/threaded_engine_perdevice.cc b/src/engine/threaded_engine_perdevice.cc index 66cfc9de1468..5cd8ca049dd3 100644 --- a/src/engine/threaded_engine_perdevice.cc +++ b/src/engine/threaded_engine_perdevice.cc @@ -183,9 +183,9 @@ class ThreadedEnginePerDevice : 
public ThreadedEngine { // allocate stream mshadow::SetDevice(ctx.dev_id); if (is_copy_worker) { - stream = mshadow::NewStream(false, false); + stream = mshadow::NewStream(false, false, ctx.dev_id); } else { - stream = mshadow::NewStream(true, MXNET_USE_CUDNN != 0); + stream = mshadow::NewStream(true, MXNET_USE_CUDNN != 0, ctx.dev_id); } } while (false); // execute task diff --git a/tests/cpp/include/test_op.h b/tests/cpp/include/test_op.h index d8f90df8447e..951affa208f8 100644 --- a/tests/cpp/include/test_op.h +++ b/tests/cpp/include/test_op.h @@ -75,7 +75,8 @@ class BasicOperatorData { : opContext_(*opContext) { CHECK_EQ(opContext_.run_ctx.stream == nullptr, true) << "Invalid runtime context stream state"; - opContext_.run_ctx.stream = mshadow::NewStream(true, true); + auto device_id = opContext->run_ctx.get_ctx().dev_id; + opContext_.run_ctx.stream = mshadow::NewStream(true, true, device_id); CHECK_EQ(opContext_.run_ctx.stream != nullptr, true) << "Unable to allocate a GPU stream"; } From d839abc09ce656205028e9f3df06067d87a2fc6c Mon Sep 17 00:00:00 2001 From: moin Date: Thu, 24 Aug 2017 19:06:01 +0200 Subject: [PATCH 024/448] fix for amalgamation build with MIN=1 (#7597) --- amalgamation/amalgamation.py | 4 ++++ src/operator/fully_connected-inl.h | 23 +++++++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/amalgamation/amalgamation.py b/amalgamation/amalgamation.py index 22b421d79fba..2aba8f4bdc77 100644 --- a/amalgamation/amalgamation.py +++ b/amalgamation/amalgamation.py @@ -32,6 +32,10 @@ minimum = int(sys.argv[6]) if len(sys.argv) > 5 else 0 android = int(sys.argv[7]) if len(sys.argv) > 6 else 0 +# blacklist linear algebra headers when building without blas. 
+if minimum != 0: + blacklist.append('linalg.h') + def pprint(lst): for item in lst: print item diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h index 6f0cf544d633..7120b5672f60 100644 --- a/src/operator/fully_connected-inl.h +++ b/src/operator/fully_connected-inl.h @@ -33,7 +33,9 @@ #include #include "./operator_common.h" #include "./elemwise_op_common.h" +#if (MSHADOW_USE_CBLAS != 0) #include "linalg.h" +#endif namespace mxnet { namespace op { @@ -108,9 +110,12 @@ class FullyConnectedOp : public Operator { Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); } - // Legacy approach shown here for comparison: - // out = dot(data, wmat.T()); +#if (MSHADOW_USE_CBLAS == 0) + // Legacy approach for amalgamation build w/out cblas + out = dot(data, wmat.T()); +#else linalg_gemm(data, wmat, out, false, true, s); +#endif if (!param_.no_bias) { Tensor bias = in_data[fullc::kBias].get(s); out += repmat(bias, data.size(0)); @@ -162,18 +167,24 @@ class FullyConnectedOp : public Operator { CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; // gradient of weight Tensor gwmat = in_grad[fullc::kWeight].get(s); - // Legacy approach shown here for comparison: - // Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data)); +#if (MSHADOW_USE_CBLAS == 0) + // Legacy approach for amalgamation build w/out cblas + Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data)); +#else linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]); +#endif // gradient of bias if (!param_.no_bias) { Tensor gbias = in_grad[fullc::kBias].get(s); Assign(gbias, req[fullc::kBias], sum_rows(grad)); } // gradient of data - // Legacy approach shown here for comparison: - // Assign(gdata, req[fullc::kData], dot(grad, wmat)); +#if (MSHADOW_USE_CBLAS == 0) + // Legacy approach for amalgamation build w/out cblas + Assign(gdata, req[fullc::kData], dot(grad, wmat)); +#else linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]); 
+#endif } private: From 9296907c8343ace2f7ed5cfef757849e63176382 Mon Sep 17 00:00:00 2001 From: reminisce Date: Thu, 24 Aug 2017 10:29:48 -0700 Subject: [PATCH 025/448] Fix import error of broadcast max, min, mod in ndarray.py and add unit tests (#7572) --- python/mxnet/ndarray/ndarray.py | 4 +- tests/python/unittest/test_operator.py | 132 +++++++++++++++++++++---- 2 files changed, 117 insertions(+), 19 deletions(-) diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 20ca2262f0cd..7322325722d6 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -41,11 +41,11 @@ from . import broadcast_add, broadcast_mul, transpose, broadcast_not_equal, broadcast_power from . import broadcast_sub, broadcast_div, broadcast_to, broadcast_equal, cast_storage from . import broadcast_greater, broadcast_greater_equal, broadcast_lesser, broadcast_lesser_equal -from . import zeros_like, slice +from . import zeros_like, slice, broadcast_minimum, broadcast_maximum, broadcast_mod __all__ = ["NDArray", "concatenate", "_DTYPE_NP_TO_MX", "_DTYPE_MX_TO_NP", "_GRAD_REQ_MAP", "ones", "add", "arange", "divide", "equal", "full", "greater", "greater_equal", - "imdecode", "lesser", "lesser_equal", "maximum", "minimum", "moveaxis", + "imdecode", "lesser", "lesser_equal", "maximum", "minimum", "moveaxis", "modulo", "multiply", "negative", "not_equal", "onehot_encode", "power", "subtract", "true_divide", "waitall", "_new_empty_handle"] diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 11d0ea22319a..ceb11ed07c02 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -24,6 +24,7 @@ from numpy.testing import assert_allclose, assert_array_equal from mxnet.test_utils import * + def np_softmax(x, axis=-1): # fix for old numpy on Travis not supporting keepdims # x = x - np.max(x, axis=-1, keepdims=True) @@ -58,6 +59,7 @@ def 
check_elementwise_sum_with_shape(shape, n): for a in arr_grad: assert_almost_equal(a.asnumpy(), out_grad.asnumpy()) + def test_elementwise_sum(): np.random.seed(0) nrepeat = 2 @@ -112,6 +114,7 @@ def check_concat_with_shape(shapes, dimension, skip_second): np_grad = arr_np[i] assert_almost_equal(grad.asnumpy(), np_grad + 1) + def test_concat(): for dimension in range(4): n = 2 @@ -158,6 +161,7 @@ def test_concat(): check_concat_with_shape(shapes,dimension,True) check_concat_with_shape(shapes,dimension,False) + def test_slice_channel(): def check_slice_channel(data_ndim, axis, num_outputs, squeeze_axis): ins = [] @@ -221,6 +225,7 @@ def check_regression(symbol, forward, backward): npout = backward(npout, arr_label.asnumpy().reshape(npout.shape)) assert_almost_equal(npout, arr_grad.asnumpy()) + def test_regression(): check_regression(mx.symbol.LogisticRegressionOutput, lambda x: 1.0 / (1.0 + np.exp(-x)), @@ -229,6 +234,7 @@ def test_regression(): lambda x: x, lambda x, y : x - y) + def check_softmax_with_ignore_label(xpu): X = mx.symbol.Variable('X') L = mx.symbol.Variable('L') @@ -261,6 +267,7 @@ def check_softmax_with_ignore_label(xpu): assert abs(np.sum(grad1[:int(shape[0]/2)])) < 1e-5 assert_almost_equal(grad0[int(shape[0]/2):], grad1[int(shape[0]/2):]) + def check_softmax_with_shape(shape, xpu, preserve_shape=False): # bind with label X = mx.symbol.Variable('X') @@ -277,11 +284,13 @@ def check_softmax_with_shape(shape, xpu, preserve_shape=False): exec1.backward() assert_almost_equal(grad.asnumpy(), np_softmax(x.asnumpy()) - l.asnumpy(), rtol=1e-4) + def test_softmax(): check_softmax_with_shape((3, 4), default_context(), preserve_shape=False) check_softmax_with_shape((3, 4), default_context(), preserve_shape=True) check_softmax_with_shape((3, 4, 2), default_context(), preserve_shape=True) + def test_python_op(): X = mx.symbol.Variable('X') op = mx.operator.NumpyOp() @@ -296,6 +305,7 @@ def test_python_op(): exec1.backward(dy) assert_almost_equal(dy.asnumpy(), 
dx.asnumpy()) + def test_swapaxes(): data = mx.symbol.Variable('data') shape = (2, 3, 4) @@ -314,6 +324,7 @@ def test_swapaxes(): assert_almost_equal(out, swap_) + def test_scalarop(): data = mx.symbol.Variable('data') shape = (3, 4) @@ -344,6 +355,7 @@ def test_scalar_pow(): check_symbolic_forward(test, [data_tmp], [data_tmp ** 2]) check_symbolic_backward(test, [data_tmp], [np.ones(shape)], [2 * data_tmp]) + def test_symbol_pow(): shape = (1, 1) @@ -362,6 +374,7 @@ def test_symbol_pow(): exp_dir = data_tmp**(exp_tmp) * np.log(data_tmp) check_symbolic_backward(test, [data_tmp, exp_tmp], [np.ones(shape)], [data_dir, exp_dir]) + def test_pow_fn(): shape = (3, 4) exp = mx.symbol.Variable("exp") @@ -371,6 +384,7 @@ def test_pow_fn(): check_symbolic_forward(y, [x], [2**x]) check_symbolic_backward(y, [x], [np.ones(shape)], [np.log(2) * 2**x]) + def test_relu(): def frelu(x): return np.maximum(x, 0.0) @@ -386,6 +400,7 @@ def frelu_grad(x): check_symbolic_forward(y, [xa], [ya]) check_symbolic_backward(y, [xa], [np.ones(shape)], [ga]) + def test_sigmoid(): def fsigmoid(a): return np.divide(1.0, (1.0 + np.exp(-a))) @@ -398,6 +413,7 @@ def fsigmoid(a): check_symbolic_forward(y, [xa], [ya]) check_symbolic_backward(y, [xa], [np.ones(shape)], [ya * (1 - ya)]) + def test_binary_logic(): def _inner_test(forward_gt, logic_sym, x_shape, y_shape, test_scalar=True): x = mx.symbol.Variable("x") @@ -453,6 +469,7 @@ def _inner_test(forward_gt, logic_sym, x_shape, y_shape, test_scalar=True): logic_sym=lambda x, y: mx.sym.broadcast_not_equal(x, y), x_shape=(1, 10), y_shape=(10, 1), test_scalar=False) + def test_embedding(): in_dim = 10 out_dim = 4 @@ -479,6 +496,7 @@ def test_embedding(): exe_test.backward([grad]) assert_almost_equal(grad_map["embed_weight"].asnumpy(), np.dot(np_onehot.T, np_grad)) + # check ops handle duplicate input correctly. 
def test_binary_op_duplicate_input(): data = mx.symbol.Variable('data') @@ -497,6 +515,7 @@ def test_binary_op_duplicate_input(): exe_square.backward(out_grad) assert_almost_equal(arr_grad.asnumpy(), 2.0 * data_tmp) + def test_sign(): data = mx.symbol.Variable('data') shape = (3, 4) @@ -520,6 +539,7 @@ def test_sign(): exe_test.backward(out_grad) assert_almost_equal(arr_grad.asnumpy(), npout_grad) + def test_round_ceil_floor(): data = mx.symbol.Variable('data') shape = (3, 4) @@ -536,6 +556,7 @@ def test_round_ceil_floor(): npout = np.round(data_tmp) + np.ceil(data_tmp) + np.floor(data_tmp) assert_almost_equal(out, npout) + def test_trunc(): data_tmp = np.random.rand(3, 4) * 10 - 5 arr_data = mx.nd.array(data_tmp) @@ -549,6 +570,7 @@ def test_trunc(): assert_almost_equal(out, npout) + def test_rsqrt_cos_sin(): data = mx.symbol.Variable('data') shape = (3, 4) @@ -572,6 +594,7 @@ def test_rsqrt_cos_sin(): exe_test.backward(out_grad) assert_almost_equal(arr_grad.asnumpy(), npout_grad) + def test_maximum_minimum(): data1 = mx.symbol.Variable('data') data2 = mx.symbol.Variable('data') @@ -584,11 +607,9 @@ def test_maximum_minimum(): arr_data1 = mx.nd.array(data_tmp1) arr_data2 = mx.nd.array(data_tmp2) - arr_grad1 = mx.nd.empty(shape) arr_grad2 = mx.nd.empty(shape) - test = mx.sym.maximum(data1,data2) + mx.sym.minimum(data1,data2); exe_test = test.bind(default_context(), args=[arr_data1,arr_data2], args_grad=[arr_grad1,arr_grad2]) exe_test.forward(is_train=True) @@ -610,6 +631,7 @@ def test_maximum_minimum(): assert_almost_equal(arr_grad1.asnumpy(), npout_grad1) assert_almost_equal(arr_grad2.asnumpy(), npout_grad2) + def test_maximum_minimum_scalar(): data1 = mx.symbol.Variable('data') shape = (3, 4) @@ -640,6 +662,7 @@ def test_maximum_minimum_scalar(): assert_almost_equal(arr_grad1.asnumpy(), npout_grad1) + def test_abs(): data = mx.symbol.Variable('data') shape = (3, 4) @@ -663,6 +686,7 @@ def test_abs(): exe_test.backward(out_grad) 
assert_almost_equal(arr_grad.asnumpy(), npout_grad) + def check_deconvolution_forward_backward(input_shape, num_filter, kernel, stride, pad): """configure A: input --> conv --> deconv --> output. the convolution and deconvoluiton has similar parameter which ensure @@ -761,6 +785,7 @@ def check_deconvolution_gradient(input_shape, num_filter, pad): assert_almost_equal(conv_args_grad[1].asnumpy() + deconv_addto_args_grad_npy[1], deconv_addto_args_grad[1].asnumpy(), rtol=1e-3, atol=1e-2) + def check_deconvolution_target_shape(input_shape, kernel, stride, pad, adj, target_shape=None): data = mx.sym.Variable(name="data") if target_shape: @@ -774,6 +799,7 @@ def check_deconvolution_target_shape(input_shape, kernel, stride, pad, adj, targ arg_shapes, out_shapes, _ = deconv.infer_shape(data=input_shape) assert out_shapes[0] == (input_shape[0], 5, 8, 8) + def test_deconvolution(): check_deconvolution_target_shape( input_shape = (2,3,4,4), @@ -822,6 +848,7 @@ def test_deconvolution(): pad = (3,3) ) + def check_nearest_upsampling_with_shape(shapes, scale, root_scale): arr = {'arg_%d'%i: mx.random.uniform(-10.0, 10.0, shape, ctx=mx.cpu()).copyto(default_context()) for i, shape in zip(range(len(shapes)), shapes)} arr_grad = {'arg_%d'%i: mx.nd.zeros(shape) for i, shape in zip(range(len(shapes)), shapes)} @@ -834,6 +861,7 @@ def check_nearest_upsampling_with_shape(shapes, scale, root_scale): name = 'arg_%d'%k assert_allclose(arr[name].asnumpy()*root_scale**2*scale**(2*k), arr_grad[name].asnumpy(), rtol=1e-4) + def check_bilinear_upsampling_with_shape(shapes, scale, root_scale): arr = {'arg_%d'%i: mx.random.uniform(-10.0, 10.0, shape, ctx=mx.cpu()).copyto(default_context()) for i, shape in zip(range(len(shapes)), shapes)} arr_grad = {'arg_%d'%i: mx.nd.zeros(shape) for i, shape in zip(range(len(shapes)), shapes)} @@ -846,6 +874,7 @@ def check_bilinear_upsampling_with_shape(shapes, scale, root_scale): name = 'arg_%d'%k assert_allclose(arr[name].asnumpy()*root_scale**2*scale**(2*k), 
arr_grad[name].asnumpy(), rtol=1e-4) + def test_nearest_upsampling(): for root_scale in [1,2,3]: for scale in [1,2,3]: @@ -854,6 +883,7 @@ def test_nearest_upsampling(): shapes = [(1,3,base*root_scale*scale**(num_shape-1-i),base*root_scale*scale**(num_shape-1-i)) for i in range(num_shape)] check_nearest_upsampling_with_shape(shapes, scale, root_scale) + def test_batchnorm_training(): def check_batchnorm_training(stype): for shape in [(2, 3), (2, 3, 2, 2)]: @@ -938,6 +968,7 @@ def check_batchnorm_training(stype): for stype in stypes: check_batchnorm_training(stype) + def test_convolution_grouping(): num_filter = 4 num_group = 2 @@ -1006,6 +1037,7 @@ def test_depthwise_convolution(): for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays): np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-4) + def gen_broadcast_data(idx): # Manually set test cases binary_op_data_shape = np.array( @@ -1061,27 +1093,35 @@ def gen_broadcast_data(idx): r_shape[np.where(r_axis_flags == 0)] = 1 return [np.random.random(l_shape), np.random.random(r_shape)] + def gen_broadcast_data_int(idx): d = gen_broadcast_data(idx); return [np.round(d[0]*100).astype(int), np.round(d[1]*100).astype(int)] + def gen_binary_data(dummy): ndim = np.random.randint(1, 6) shape = np.random.randint(1, 6, size=(ndim,)) return [np.random.random(shape), np.random.random(shape)] + def gen_binary_data_int(dummy): d = gen_binary_data(dummy); return [np.round(d[0]*100).astype(int), np.round(d[1]*100).astype(int)] -def check_binary_op_forward(symbol, baseline, gen_data, rtol=1e-3, atol=1e-5): + +def check_binary_op_forward(symbol, baseline, gen_data, rtol=1e-3, atol=1e-5, mx_nd_func=None): sample_num = 200 for i in range(sample_num): d = gen_data(i) x = baseline(d[0], d[1]) - y = symbol.bind(default_context(), args={'a': mx.nd.array(d[0]), 'b' : mx.nd.array(d[1])}) + y = symbol.bind(default_context(), args={'a': mx.nd.array(d[0]), 'b': mx.nd.array(d[1])}) 
y.forward(is_train=True) y = y.outputs[0].asnumpy() + if mx_nd_func is not None: + d0 = mx.nd.array(d[0], dtype=d[0].dtype) + d1 = mx.nd.array(d[1], dtype=d[1].dtype) + assert_almost_equal(y, mx_nd_func(d0, d1).asnumpy(), rtol=rtol, atol=atol) idx = np.abs(x-y) > atol+rtol*np.abs(x) if idx.any(): print('found precision problem') @@ -1097,11 +1137,13 @@ def check_binary_op_forward(symbol, baseline, gen_data, rtol=1e-3, atol=1e-5): print('diff: {}'.format(np.abs(x-y)[idx] - atol-rtol*np.abs(x)[idx])) assert_allclose(y, x, rtol=rtol, atol=atol) + def check_binary_op_backward(symbol, baseline, gen_data, rtol=1e-3, atol=1e-5): sample_num = 200 for i in range(sample_num): d = gen_data(i) out = np.random.random((d[0] + d[1]).shape) + def reduce_op(shape, x): if shape == x.shape: return x @@ -1111,18 +1153,20 @@ def reduce_op(shape, x): keepdims_shape[i] = 1 x = np.sum(x, axis=i).reshape(keepdims_shape) return x + baseline_grad1, baseline_grad2 = baseline(out, d[0], d[1]) x_1 = reduce_op(d[0].shape, baseline_grad1) x_2 = reduce_op(d[1].shape, baseline_grad2) y_1 = mx.nd.empty(d[0].shape) y_2 = mx.nd.empty(d[1].shape) - y = symbol.bind(default_context(), args={'a': mx.nd.array(d[0]), 'b' : mx.nd.array(d[1])}, + y = symbol.bind(default_context(), args={'a': mx.nd.array(d[0]), 'b': mx.nd.array(d[1])}, args_grad=[y_1, y_2]) y.forward(is_train=True) y.backward([mx.nd.array(out)]) assert_allclose(y_1.asnumpy(), x_1, rtol=rtol, atol=atol) assert_allclose(y_2.asnumpy(), x_2, rtol=rtol, atol=atol) + def test_binary_op(): a = mx.sym.Variable('a') b = mx.sym.Variable('b') @@ -1177,51 +1221,65 @@ def test_bneq(a, b): test_bpow(a, b) test_bneq(a, b) + def test_broadcast_binary_op(): a = mx.sym.Variable('a') b = mx.sym.Variable('b') def test_bplus(a, b): c = mx.sym.broadcast_plus(a, b) - check_binary_op_forward(c, lambda a, b: a + b, gen_broadcast_data) + check_binary_op_forward(c, lambda a, b: a + b, gen_broadcast_data, mx_nd_func=mx.nd.add) check_binary_op_backward(c, lambda g_out, a, 
b: (g_out, g_out), gen_broadcast_data) def test_bminus(a, b): c = mx.sym.broadcast_minus(a, b) - check_binary_op_forward(c, lambda a, b: a - b, gen_broadcast_data) + check_binary_op_forward(c, lambda a, b: a - b, gen_broadcast_data, mx_nd_func=mx.nd.subtract) check_binary_op_backward(c, lambda g_out, a, b: (g_out, - g_out), gen_broadcast_data) def test_bmul(a, b): c = mx.sym.broadcast_mul(a, b) - check_binary_op_forward(c, lambda a, b: a * b, gen_broadcast_data) + check_binary_op_forward(c, lambda a, b: a * b, gen_broadcast_data, mx_nd_func=mx.nd.multiply) check_binary_op_backward(c, lambda g_out, a, b: (g_out * b, g_out * a), gen_broadcast_data) def test_bdiv(a, b): c = mx.sym.broadcast_div(a, b) - check_binary_op_forward(c, lambda a, b: a / b, gen_broadcast_data) + check_binary_op_forward(c, lambda a, b: a / b, gen_broadcast_data, mx_nd_func=mx.nd.divide) check_binary_op_backward(c, lambda g_out, a, b: (g_out / b, - g_out * a / (b * b)), gen_broadcast_data) def test_bmod(a, b): c = mx.sym.broadcast_mod(a, b) - check_binary_op_forward(c, lambda a, b: a % b, gen_broadcast_data, atol=1) + check_binary_op_forward(c, lambda a, b: a % b, gen_broadcast_data, atol=1, mx_nd_func=mx.nd.modulo) check_binary_op_backward(c, lambda g_out, a, b: (g_out, - g_out * (a // b)), gen_broadcast_data, atol=1) def test_bmod_int(a, b): c = mx.sym.broadcast_mod(mx.sym.cast(a, dtype='int32'), mx.sym.cast(b, dtype='int32')) - check_binary_op_forward(c, lambda a, b: a % b, gen_broadcast_data_int) + check_binary_op_forward(c, lambda a, b: a % b, gen_broadcast_data_int, mx_nd_func=mx.nd.modulo) check_binary_op_backward(c, lambda g_out, a, b: (np.zeros_like(a), np.zeros_like(b)), gen_broadcast_data_int) def test_bpow(a, b): c = mx.sym.broadcast_power(a, b) - check_binary_op_forward(c, lambda a, b: a ** b, gen_broadcast_data) + check_binary_op_forward(c, lambda a, b: a ** b, gen_broadcast_data, mx_nd_func=mx.nd.power) check_binary_op_backward(c, lambda g_out, a, b: (g_out * a **(b - 1) * b, 
g_out * a ** b * np.log(a)), gen_broadcast_data) def test_bequal(a, b): c = mx.sym.broadcast_equal(a, b) - check_binary_op_forward(c, lambda a, b: (a == b).astype(a.dtype), gen_broadcast_data_int) + check_binary_op_forward(c, lambda a, b: (a == b).astype(a.dtype), gen_broadcast_data_int, + mx_nd_func=mx.nd.equal) check_binary_op_backward(c, lambda g_out, a, b: (np.zeros_like(a), np.zeros_like(b)), gen_broadcast_data_int) + def test_bmax(a, b): + c = mx.sym.broadcast_maximum(a, b) + check_binary_op_forward(c, lambda x, y: np.maximum(x, y), gen_broadcast_data, mx_nd_func=mx.nd.maximum) + # pass idx=200 to gen_broadcast_data so that generated ndarrays' sizes are not too big + check_numeric_gradient(c, gen_broadcast_data(idx=200), rtol=1e-2, atol=1e-3) + + def test_bmin(a, b): + c = mx.sym.broadcast_minimum(a, b) + check_binary_op_forward(c, lambda x, y: np.minimum(x, y), gen_broadcast_data, mx_nd_func=mx.nd.minimum) + # pass idx=200 to gen_broadcast_data so that generated ndarrays' sizes are not too big + check_numeric_gradient(c, gen_broadcast_data(idx=200), rtol=1e-2, atol=1e-3) + test_bplus(a, b) test_bminus(a, b) test_bmul(a, b) @@ -1230,6 +1288,9 @@ def test_bequal(a, b): test_bmod_int(a, b) test_bpow(a, b) test_bequal(a, b) + test_bmax(a, b) + test_bmin(a, b) + def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), verbose=False): # Input for spike response @@ -1238,7 +1299,6 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), spike_img = mx.nd.array(spike_imgs) spike_img2 = mx.nd.array(spike_imgs) - kernel_weights = mx.nd.ones(shape=tuple([1,1]+list(kernel_shape)), dtype=np.float32) kernel_weights2 = mx.nd.ones(shape=tuple([1,1]+list(kernel_shape)), dtype=np.float32) @@ -1374,6 +1434,7 @@ def test_reshape_new(src_shape, shape_args, reverse, dst_shape): exe.backward(out_grads=[mx.nd.array(out_grad_npy, ctx=default_context())]) assert_allclose(exe.grad_arrays[0].asnumpy(), out_grad_npy.reshape((5, 4, 3, 
7))) + def test_reduce(): sample_num = 500 def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym, nan_prob = 0): @@ -1490,6 +1551,7 @@ def test_broadcasting_ele(sym_bcast): test_broadcasting_ele(sym_bcast_axis) test_broadcasting_ele(sym_bcast_to) + def test_transpose(): for ndim in range(1, 7): for t in range(5): @@ -1589,6 +1651,7 @@ def test_slice_axis(): xx[idx] = x.asnumpy()[idx] assert_allclose(xx + x_grad_npy, xgrad.asnumpy(), atol=1E-5) + def test_flip(): for ndim in range(1, 6): for t in range(5): @@ -1677,18 +1740,22 @@ def dot_sym(data_type): x = mx.sym.Variable('x', dtype=data_type) y = mx.sym.Variable('y', dtype=data_type) return mx.sym.dot(x, y) + def dot_sym_xT(data_type): x = mx.sym.Variable('x', dtype=data_type) y = mx.sym.Variable('y', dtype=data_type) return mx.sym.dot(x, y, transpose_a=True) + def dot_sym_yT(data_type): x = mx.sym.Variable('x', dtype=data_type) y = mx.sym.Variable('y', dtype=data_type) return mx.sym.dot(x, y, transpose_b=True) + def dot_sym_xT_yT(data_type): x = mx.sym.Variable('x', dtype=data_type) y = mx.sym.Variable('y', dtype=data_type) return mx.sym.dot(x, y, transpose_a=True, transpose_b=True) + for data_type in dtypes: for ashape, bshape in [((3, 4), (4, 5)), ((2, 3, 4), (4, 5, 6))]: m1_npy = np.random.uniform(-1, 1, ashape) @@ -1700,6 +1767,7 @@ def dot_sym_xT_yT(data_type): check_numeric_gradient(dot_sym_yT(data_type), [m1_npy, m2_npy.T], numeric_eps=1e-1, rtol=2e-2, atol=1e-3) check_numeric_gradient(dot_sym_xT_yT(data_type), [m1_npy.T, m2_npy.T], numeric_eps=1e-1, rtol=2e-2, atol=1e-3) + def test_batch_dot(): dtypes = ['float32', 'float64'] @@ -1756,6 +1824,7 @@ def test_batch_dot(): assert_almost_equal(exe_add.grad_dict['b'].asnumpy(), bgrad_npy + b_init_grad_npy, rtol=1e-3, atol=1e-4) + def get_correlation(data1,data2,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply): img1 = mx.sym.Variable('img1') @@ -1763,6 +1832,7 @@ def 
get_correlation(data1,data2,kernel_size,max_displacement,stride1,stride2,pad return mx.sym.Correlation(data1=img1,data2=img2,kernel_size =kernel_size,max_displacement = max_displacement, stride1 = stride1,stride2 = stride2,pad_size= pad_size,is_multiply = is_multiply) + def correlation_forward(data1,data2,pad_size,kernel_size,stride1,stride2,max_displacement,is_multiply): # compute output's dimension @@ -1810,6 +1880,7 @@ def correlation_forward(data1,data2,pad_size,kernel_size,stride1,stride2,max_dis out /= float(kernel_size**2*data1.shape[1]) return out,tmp1,tmp2 + def correlation_backward(out_grad,tmp1,tmp2,data1,data2,pad_size,kernel_size,stride1,stride2,max_displacement,is_multiply): # compute output's dimension @@ -1859,6 +1930,7 @@ def correlation_backward(out_grad,tmp1,tmp2,data1,data2,pad_size,kernel_size,str tmp2_grad = tmp2_grad / float(kernel_size**2*data1.shape[1]) return tmp1_grad[:,:,pad_size:pad_size+data1.shape[2],pad_size:pad_size+data1.shape[3]],tmp2_grad[:,:,pad_size:pad_size+data1.shape[2],pad_size:pad_size+data1.shape[3]], + def unittest_correlation(data_shape,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply): img1 = np.random.random(data_shape) @@ -1891,8 +1963,8 @@ def unittest_correlation(data_shape,kernel_size,max_displacement,stride1,stride2 assert_almost_equal(exe1.grad_dict['img1'].asnumpy(), grad1, rtol=1e-3, atol=1e-4) assert_almost_equal(exe1.grad_dict['img2'].asnumpy(), grad2, rtol=1e-3, atol=1e-4) -def test_correlation(): +def test_correlation(): unittest_correlation((1,3,10,10), kernel_size = 1,max_displacement = 4,stride1 = 1,stride2 = 1,pad_size = 4,is_multiply = False) unittest_correlation((5,1,15,15), kernel_size = 1,max_displacement = 5,stride1 = 1,stride2 = 1,pad_size = 5,is_multiply = False) unittest_correlation((5,1,15,15), kernel_size = 1,max_displacement = 5,stride1 = 1,stride2 = 1,pad_size = 5,is_multiply = True) @@ -1932,6 +2004,7 @@ def test_support_vector_machine_l1_svm(): 
assert_almost_equal(grad_np, grad.asnumpy()) + def test_support_vector_machine_l2_svm(): xpu = default_context() shape = (20, 10) @@ -1979,6 +2052,7 @@ def test_roipooling(): grad_nodes={'data':'add', 'rois':'null'}, numeric_eps=1e-4, rtol=1e-1, atol=1E-4) + def check_pad_with_shape(shape, xpu, pad_width, mode): # bind with label X = mx.symbol.Variable('X') @@ -1997,6 +2071,7 @@ def check_pad_with_shape(shape, xpu, pad_width, mode): # grad check check_numeric_gradient(Y, [x.asnumpy()], numeric_eps=1e-2, rtol=1e-2) + def test_pad(): shape1 = (2, 3, 3, 5) pad1 = (0, 0, 0, 0, 1, 2, 3, 4) @@ -2009,6 +2084,7 @@ def test_pad(): check_pad_with_shape(shape1, default_context(), pad1, 'reflect') check_pad_with_shape(shape2, default_context(), pad2, 'reflect') + def np_instance_norm(data, weight, bias, eps): spatial_dims = data.shape[2::] num_spatial_vals = np.prod(np.array(spatial_dims)) @@ -2025,6 +2101,7 @@ def np_instance_norm(data, weight, bias, eps): biasBatch = np.reshape(np.repeat(biasBatch, num_spatial_vals), data.shape) return weightBatch * (data - mean)/np.sqrt(var + eps) + biasBatch + def check_instance_norm_with_shape(shape, xpu): # bind with label eps = 0.001 @@ -2045,12 +2122,14 @@ def check_instance_norm_with_shape(shape, xpu): check_numeric_gradient(Y, {'X':x.asnumpy(), 'G':gamma.asnumpy(), 'B':beta.asnumpy()}, numeric_eps=1e-2, rtol=1e-2, atol=1e-2) + def test_instance_normalization(): check_instance_norm_with_shape((1, 1, 1), default_context()) check_instance_norm_with_shape((2, 1, 2), default_context()) check_instance_norm_with_shape((2,4,5,6), default_context()) check_instance_norm_with_shape((3,3,2,3,2,1,1), default_context()) + def check_l2_normalization(in_shape, mode, ctx=default_context(), norm_eps=1e-10): data = mx.symbol.Variable('data') out = mx.symbol.L2Normalization(data=data, mode=mode, eps=norm_eps) @@ -2084,6 +2163,7 @@ def check_l2_normalization(in_shape, mode, ctx=default_context(), norm_eps=1e-10 # check gradient 
check_numeric_gradient(out, [in_data], numeric_eps=1e-3, rtol=1e-2, atol=1e-3) + def test_l2_normalization(): for mode in ['channel', 'spatial', 'instance']: for nbatch in [1, 4]: @@ -2093,6 +2173,7 @@ def test_l2_normalization(): for width in [5, 7]: check_l2_normalization((nbatch, nchannel, height, width), mode) + def sequence_mask_numpy(array, lengths, value): arrayMask = array.copy() shape = array.shape @@ -2101,6 +2182,7 @@ def sequence_mask_numpy(array, lengths, value): arrayMask[int(lengths[i]):, i] = value return arrayMask + def check_sequence_mask(shape, xpu, mask_value): # bind with label X = mx.symbol.Variable('X') @@ -2123,12 +2205,14 @@ def check_sequence_mask(shape, xpu, mask_value): check_numeric_gradient(Y, [x.asnumpy(), l.asnumpy()], grad_nodes={'X':'write'}, numeric_eps=1e-3, rtol=1e-2) + def test_sequence_mask(): shape1 = (4, 2, 2, 3) shape2 = (1, 2, 2, 3, 1, 1) check_sequence_mask(shape1, default_context(), 2.1) check_sequence_mask(shape2, default_context(), 0.1) + def check_sequence_reverse(xpu): # sample data @@ -2192,6 +2276,7 @@ def test_wrapper(arr, xpu, sequence_length=None, use_sequence_length=False): def test_sequence_reverse(): check_sequence_reverse(mx.cpu()) + def mathematical_core_binary(name, forward_mxnet_call, forward_numpy_call, @@ -2236,6 +2321,7 @@ def mathematical_core_binary(name, assert_almost_equal(arr_grad1, npout_grad1) assert_almost_equal(arr_grad2, npout_grad2) + def mathematical_core(name, forward_mxnet_call, forward_numpy_call, backward_numpy_call, data_init=5., grad_init=2.): data = mx.symbol.Variable('data') shape = (3, 4) @@ -2264,6 +2350,7 @@ def mathematical_core(name, forward_mxnet_call, forward_numpy_call, backward_num # print(npout_grad) assert_almost_equal(arr_grad, npout_grad) + def test_special_functions_using_scipy(): try: from scipy import special as scipy_special @@ -2294,6 +2381,7 @@ def rounding(name, forward_mxnet_call, forward_numpy_call, data_init=5., grad_in npout = forward_numpy_call(data_tmp) 
assert_almost_equal(out, npout) + def test_mathematical(): # rsqrt mathematical_core("rsqrt", @@ -2380,6 +2468,7 @@ def test_mathematical(): # fix rounding("fix", lambda x: mx.sym.fix(x), lambda x: np.fix(x)) + def test_special_functions_using_scipy(): try: from scipy import special as scipy_special @@ -2395,6 +2484,7 @@ def test_special_functions_using_scipy(): mathematical_core("gammaln", lambda x: mx.sym.gammaln(x), lambda x: scipy_special.gammaln(x), lambda x: scipy_special.psi(x), 0.5, 0.5) + def test_clip(): data = mx.symbol.Variable('data') shape = (30, 30) @@ -2404,6 +2494,7 @@ def test_clip(): check_symbolic_backward(test, [data_tmp], [np.ones(shape)], [np.where(data_tmp < 0.6, [1], [0]) * np.where(data_tmp > -0.6, [1], [0])]) + def test_init(): def test_basic_val_init(sym_func, np_func, shape, dtype): x = sym_func(shape=shape, dtype=dtype) @@ -2411,6 +2502,7 @@ def test_basic_val_init(sym_func, np_func, shape, dtype): exe.forward(is_train=True) assert_almost_equal(exe.outputs[0].asnumpy(), np_func(shape=shape, dtype=dtype)) assert exe.outputs[0].asnumpy().dtype == dtype + def test_arange(): for i in range(5): start = np.random.rand() * 10 @@ -2432,6 +2524,7 @@ def test_arange(): def test_order(): ctx = default_context() + def gt_topk(dat, axis, ret_typ, k, is_ascend): if ret_typ == "indices": if is_ascend: @@ -2538,6 +2631,7 @@ def test_blockgrad(): assert_almost_equal(exe.outputs[0].asnumpy(), a_npy) exe.backward() # No error if BlockGrad works + def test_take(): def check_output_n_grad(data_shape, idx_shape): exe = result.simple_bind(default_context(), a=data_shape, @@ -2691,7 +2785,6 @@ def bilinear_forward_numpy(data, grid): +(1-xWeightTopLeft) * (1-yWeightTopLeft) * inBottomRight return out - def bilinear_backward_numpy(out_grad, data, grid): data_grad = np.zeros(data.shape, dtype=np.float32) @@ -2802,6 +2895,7 @@ def bilinear_backward_numpy(out_grad, data, grid): assert_almost_equal(exe_addto.grad_dict['data'].asnumpy(), data_grad + 
data_initial_grid, rtol=1e-3,atol=1e-5) assert_almost_equal(exe_addto.grad_dict['grid'].asnumpy(), grid_grad + grid_initial_grid, rtol=1e-3,atol=1e-5) + def test_index2d(): for _ in range(30): n = np.random.randint(1, 100) @@ -2811,6 +2905,7 @@ def test_index2d(): r = mx.nd.batch_take(data, x) assert_almost_equal(r.asnumpy(), data.asnumpy()[np.arange(n), x.asnumpy()]) + def test_cast(): for srctype in [np.int32, np.float32, np.float16]: for dsttype in [np.float32, np.int32, np.float16]: @@ -3277,6 +3372,7 @@ def check_ctc_loss(acts, labels, loss_truth): # test grad check_numeric_gradient(ctc, [acts, labels], grad_nodes=['input'], rtol=0.05, atol=1e-3) + def test_ctc_loss(): # Test 1: check that batches are same + check against Torch WarpCTC acts = np.array([ @@ -3310,6 +3406,7 @@ def test_quantization_op(): assert same(qa.asnumpy(), qa_real.asnumpy()) assert same(a_.asnumpy(), a_real.asnumpy()) + def test_reciprocal_op(): data_tmp = np.random.rand(3, 4) * 10 - 5 # Avoid possible division by 0 errors @@ -3320,6 +3417,7 @@ def test_reciprocal_op(): check_numeric_gradient(test, [data_tmp]) check_symbolic_forward(test, [data_tmp], [np.reciprocal(data_tmp)]) + def test_custom_op(): class Sqr(mx.operator.CustomOp): def forward(self, is_train, req, in_data, out_data, aux): @@ -3391,6 +3489,7 @@ def test_psroipooling(): check_numeric_gradient(op, [im_data, rois_data], rtol=rtol, atol=atol, grad_nodes=grad_nodes, ctx=mx.gpu(0)) + def test_deformable_convolution(): for num_batch in [1, 2]: for num_channel_data, num_deformable_group in itertools.product([4, 8], [1, 2]): @@ -3461,7 +3560,6 @@ def test_deformable_psroipooling(): grad_nodes=grad_nodes, ctx=mx.gpu(0)) - def test_laop(): # enable numerical checking of gradients From b0b46641a3d3b58f7b38d8bf84b9bf0c392f3873 Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Thu, 24 Aug 2017 10:43:52 -0700 Subject: [PATCH 026/448] add ctx to begin_state in rnn_layer (#7580) * add ctx to begin_state * fix image classification --- 
example/gluon/image_classification.py | 8 ++++++-- python/mxnet/gluon/rnn/rnn_layer.py | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/example/gluon/image_classification.py b/example/gluon/image_classification.py index 3f84ff8602ed..b8d018d3098a 100644 --- a/example/gluon/image_classification.py +++ b/example/gluon/image_classification.py @@ -57,6 +57,8 @@ help='enable batch normalization or not in vgg. default is false.') parser.add_argument('--use-pretrained', action='store_true', help='enable using pretrained model from gluon.') +parser.add_argument('--kvstore', type=str, default='device', + help='kvstore to use for trainer/module.') parser.add_argument('--log-interval', type=int, default=50, help='Number of batches to wait before logging.') opt = parser.parse_args() @@ -116,7 +118,8 @@ def train(epochs, ctx): if isinstance(ctx, mx.Context): ctx = [ctx] net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) - trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': opt.lr, 'wd': opt.wd}) + trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': opt.lr, 'wd': opt.wd}, + kvstore = opt.kvstore) metric = mx.metric.Accuracy() loss = gluon.loss.SoftmaxCrossEntropyLoss() @@ -162,7 +165,8 @@ def train(epochs, ctx): out = net(data) softmax = mx.sym.SoftmaxOutput(out, name='softmax') mod = mx.mod.Module(softmax, context=[mx.gpu(i) for i in range(gpus)] if gpus > 0 else [mx.cpu()]) - mod.fit(train_data, num_epoch=opt.epochs, batch_end_callback = mx.callback.Speedometer(batch_size, 1)) + mod.fit(train_data, num_epoch=opt.epochs, kvstore=opt.kvstore, + batch_end_callback = mx.callback.Speedometer(batch_size, 1)) else: if opt.mode == 'hybrid': net.hybridize() diff --git a/python/mxnet/gluon/rnn/rnn_layer.py b/python/mxnet/gluon/rnn/rnn_layer.py index 86b7c618e503..063d56654f9f 100644 --- a/python/mxnet/gluon/rnn/rnn_layer.py +++ b/python/mxnet/gluon/rnn/rnn_layer.py @@ -141,7 +141,7 @@ def begin_state(self, batch_size=0, 
func=ndarray.zeros, **kwargs): batch_size: int Only required for `NDArray` API. Size of the batch ('N' in layout). Dimension of the input. - func : callable, default `symbol.zeros` + func : callable, default `ndarray.zeros` Function for creating initial state. For Symbol API, func can be `symbol.zeros`, `symbol.uniform`, @@ -172,7 +172,7 @@ def forward(self, inputs, states=None): batch_size = inputs.shape[self._layout.find('N')] skip_states = states is None if skip_states: - states = self.begin_state(batch_size) + states = self.begin_state(batch_size, ctx=inputs.context) if isinstance(states, ndarray.NDArray): states = [states] for state, info in zip(states, self.state_info(batch_size)): From f489810e0243aec05bc5107e94a9742cf55e1a1c Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Thu, 24 Aug 2017 12:02:41 -0700 Subject: [PATCH 027/448] contrib ctc interface changes, cudnn7 CTC, and gluon CTC (#7442) * contrib ctc interface changes for compatibility * cudnn ctc * update per comments --- python/mxnet/gluon/loss.py | 90 +++++++ src/operator/contrib/ctc_loss-inl.h | 331 ++++++++++++++++++++++---- src/operator/contrib/ctc_loss.cc | 12 +- src/operator/sequence_op_common.h | 18 +- tests/python/gpu/test_operator_gpu.py | 1 + tests/python/unittest/test_loss.py | 30 +++ 6 files changed, 430 insertions(+), 52 deletions(-) diff --git a/python/mxnet/gluon/loss.py b/python/mxnet/gluon/loss.py index 583910590868..bb45e8926e95 100644 --- a/python/mxnet/gluon/loss.py +++ b/python/mxnet/gluon/loss.py @@ -21,6 +21,8 @@ from __future__ import absolute_import from .. 
import ndarray +from ..contrib import symbol as symbol_contrib +from ..contrib import ndarray as ndarray_contrib from ..base import numeric_types from .block import HybridBlock @@ -295,3 +297,91 @@ def hybrid_forward(self, F, output, label, sample_weight=None): loss = label * (F.log(label+1e-8) - output) loss = _apply_weighting(F, loss, self._weight, sample_weight) return F.mean(loss, axis=self._batch_axis, exclude=True) + + +class CTCLoss(Loss): + r"""Connectionist Temporal Classification Loss. + + See `"Connectionist Temporal Classification: Labelling Unsegmented + Sequence Data with Recurrent Neural Networks" + `_ paper for more information. + + Parameters + ---------- + layout : str, default 'NTC' + Layout of the output sequence activation vector. + label_layout : str, default 'NT' + Layout of the labels. + padding_mask : int or None, default -1 + This is the label value to be considered padding, which is used to derive the actual + lengths of labels. Only required when `label_lengths` is None. + weight : float or None + Global scalar weight for loss. + sample_weight : Symbol or None + Per sample weighting. Must be broadcastable to + the same shape as loss. For example, if loss has + shape (64, 10) and you want to weight each sample + in the batch, `sample_weight` should have shape (64, 1). + This should be used as the fifth argument when calling this loss. + + Input shapes: + `data` is an activation tensor without softmax. + Its shape depends on `layout`. For `layout='TNC'`, this + input has shape `(sequence_length, batch_size, alphabet_size)` + + `label` is the label index matrix. + Its shape depends on `label_layout`. For `label_layout='TN'`, this + input has shape `(label_sequence_length, batch_size)` + When `label_lengths` is not specified, the first occurrence of `padding_mask` + in each sample marks the end of the label sequence of that sample. + For example, suppose there are two samples, with *label_sequence_length* = 4. 
+ The two sequences of labels are [2, 1] and [3, 2, 2], and their actual lengths + are smaller than 4. Thus, given *padding_mask* = 0, the resulting ```label``` + tensor should be padded to be:: + + [[2, 1, 0, 0], [3, 2, 2, 0]] + + `data_lengths` is optional and defaults to None. + When specified, it represents the actual lengths of data. + The shape should be (batch_size,). + If None, the data lengths are treated as being equal to the max sequence length. + This should be used as the third argument when calling this loss. + + `label_lengths` is optional and defaults to None. + When specified, it represents the actual lengths of labels. + The shape should be (batch_size,). + If None, the label lengths are derived from the first occurrence of + the value specified by `padding_mask`. + This should be used as the fourth argument when calling this loss. + + Output shape: + The CTC loss output has the shape (batch_size,). + """ + def __init__(self, layout='NTC', label_layout='NT', padding_mask=-1, + weight=None, **kwargs): + assert layout in ['NTC', 'TNC'],\ + "Only 'NTC' and 'TNC' layouts for output are supported. Got: %s"%layout + assert label_layout in ['NT', 'TN'],\ + "Only 'NT' and 'TN' layouts for label are supported. 
Got: %s"%label_layout + self._layout = layout + self._label_layout = label_layout + self._padding_mask = padding_mask + batch_axis = label_layout.find('N') + super(CTCLoss, self).__init__(weight, batch_axis, **kwargs) + + def hybrid_forward(self, F, data, label, + data_lengths=None, label_lengths=None, sample_weight=None): + if self._layout == 'NTC': + data = F.swapaxes(data, 0, 1) + if self._batch_axis == 1: + label = F.swapaxes(label, 0, 1) + if F is ndarray: + F_contrib = ndarray_contrib + else: + F_contrib = symbol_contrib + loss = F_contrib.CTCLoss(data, label, + use_data_lengths=data_lengths is not None, + use_label_lengths=label_lengths is not None, + data_lengths=data_lengths, label_lengths=label_lengths, + padding_mask=self._padding_mask) + return _apply_weighting(F, loss, self._weight, sample_weight) diff --git a/src/operator/contrib/ctc_loss-inl.h b/src/operator/contrib/ctc_loss-inl.h index 0d0c0bf4cd09..13ce1f240afd 100644 --- a/src/operator/contrib/ctc_loss-inl.h +++ b/src/operator/contrib/ctc_loss-inl.h @@ -41,6 +41,11 @@ #include "../sequence_op_common.h" #include "../mshadow_op.h" +#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 +#define CUDNN_LABEL_LENGTH_LIMIT 256 +#include "../nn/softmax-inl.h" +#endif + namespace mxnet { namespace op { @@ -52,14 +57,14 @@ enum CTCLossOpForwardResource { kTempSpace }; template inline void get_workspace_size(std::vector *label_lengths, - std::vector *input_lengths, + std::vector *data_lengths, int alphabet_size, int minibatch, bool gpu, size_t *size_bytes) { // This is the max of all S and T for all examples in the minibatch. 
int maxL = *std::max_element(label_lengths->data(), label_lengths->data() + minibatch); - int maxT = *std::max_element(input_lengths->data(), - input_lengths->data() + minibatch); + int maxT = *std::max_element(data_lengths->data(), + data_lengths->data() + minibatch); const int S = 2 * maxL + 1; @@ -125,34 +130,109 @@ inline void get_workspace_size(std::vector *label_lengths, } // Takes a tensor of labels, and interprets 0-elements at the end of the vector -// as padding. The tensor is packed into a std::vector without padding -// characters. The sequence lengths are also inferred from the padding chars +// as padding. The tensor is packed into an std::vector without padding +// characters. The label sequence lengths are also inferred from the padding chars. +// When cudnn is enabled, the return value signifies whether the cudnn length limit is exceeded. template -inline void LabelTensorToPackedVector(mshadow::Tensor labels, +inline bool LabelTensorToPackedVector(mshadow::Tensor labels, + int padding_mask, std::vector *packed_labels, std::vector *label_lengths) { int batch = labels.size(0); int max_num_labels = labels.size(1); - std::vector cpu_labels(max_num_labels); + bool exceed_limit = false; + + std::vector cpu_labels(max_num_labels*batch); + mshadow::Tensor flat_labels = labels.FlatTo1D(); + IndexTensorToVector(flat_labels, &cpu_labels); + + for (int b = 0; b < batch; ++b) { + auto start = cpu_labels.data()+b*max_num_labels; + auto res = std::find(start, start+max_num_labels, padding_mask); + int len = std::distance(start, res); +#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + exceed_limit = exceed_limit || len > CUDNN_LABEL_LENGTH_LIMIT; +#endif + std::copy(start, start + len, + std::back_inserter(*packed_labels)); + label_lengths->at(b) = len; + } + return exceed_limit; +} + +// Takes a tensor of labels, and a vector which specifies the actual length of each label +// The tensor is packed into an std::vector without padding 
characters. +// The label length vector is copied into an std::vector. +// When cudnn is enabled, the return value signifies whether the cudnn length limit is exceeded. +template +inline bool PackLabelByLength(mshadow::Tensor labels, + mshadow::Tensor in_label_lengths, + std::vector *packed_labels, + std::vector *label_lengths) { + int batch = labels.size(0); + int max_num_labels = labels.size(1); + bool exceed_limit = false; + + IndexTensorToVector(in_label_lengths, label_lengths); + + std::vector cpu_labels(max_num_labels*batch); + mshadow::Tensor flat_labels = labels.FlatTo1D(); + IndexTensorToVector(flat_labels, &cpu_labels); for (int b = 0; b < batch; ++b) { - IndexTensorToVector(labels[b], &cpu_labels); - auto res = std::find(cpu_labels.begin(), cpu_labels.end(), 0); - int len = std::distance(cpu_labels.begin(), res); - std::copy(cpu_labels.begin(), cpu_labels.begin() + len, + auto start = cpu_labels.data()+b*max_num_labels; + int len = label_lengths->at(b); +#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + exceed_limit = exceed_limit || len > CUDNN_LABEL_LENGTH_LIMIT; +#endif + std::copy(start, start + len, std::back_inserter(*packed_labels)); - label_lengths->emplace_back(len); } + return exceed_limit; } struct CTCLossParam : public dmlc::Parameter { - DMLC_DECLARE_PARAMETER(CTCLossParam) {} + bool use_data_lengths; + bool use_label_lengths; + dmlc::optional padding_mask; + DMLC_DECLARE_PARAMETER(CTCLossParam) { + DMLC_DECLARE_FIELD(use_data_lengths).set_default(false) + .describe("Whether the data lenghts are decided by `data_lengths`. " + "If false, the lengths are equal to the max sequence length."); + DMLC_DECLARE_FIELD(use_label_lengths).set_default(false) + .describe("Whether the label lenghts are decided by " + "`label_lengths`, or derived from `padding_mask`. 
" + "If false, the lengths are derived from the " + "first occurrence of the value of `padding_mask`."); + DMLC_DECLARE_FIELD(padding_mask).set_default(dmlc::optional(0)) + .describe("int or None. This is the label value to be considered padding. " + "Only required when `use_label_lengths` is false. " + "Labels before the first occurrence of `padding_mask` are included " + "in calculation."); + } }; template class CTCLossOp : public Operator { public: - explicit CTCLossOp(CTCLossParam p) { this->param_ = p; } + explicit CTCLossOp(CTCLossParam p) { + this->param_ = p; + exceed_cudnn_limit = false; +#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + CUDNN_CALL(cudnnCreateCTCLossDescriptor(&ctc_desc_)); + CUDNN_CALL(cudnnSetCTCLossDescriptor(ctc_desc_, CUDNN_DATA_FLOAT)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&prob_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&grad_desc_)); +#endif + } + + ~CTCLossOp() { +#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + CUDNN_CALL(cudnnDestroyCTCLossDescriptor(ctc_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(prob_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(grad_desc_)); +#endif + } virtual void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, @@ -160,8 +240,9 @@ class CTCLossOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 2U); + CHECK_EQ(in_data.size(), 2U+param_.use_data_lengths+param_.use_label_lengths); CHECK_EQ(out_data.size(), 2U); + exceed_cudnn_limit = false; Stream *s = ctx.get_stream(); Tensor data = @@ -178,27 +259,41 @@ class CTCLossOp : public Operator { int batch_size = data.size(1); int alphabet_size = data.size(2); + // data_lengths + std::vector data_lengths(batch_size, max_seq_len); + if (param_.use_data_lengths) { + int kInputLength = 2; + IndexTensorToVector(in_data[kInputLength].get(s), &data_lengths); + } + // label_lengths 
std::vector packed_labels; - std::vector label_lengths; - LabelTensorToPackedVector(labels, &packed_labels, &label_lengths); - - // allocate temporary workspace - std::vector input_lengths(batch_size, max_seq_len); - size_t size_bytes; - bool gpu = data.kDevCPU ? false : true; - get_workspace_size(&label_lengths, &input_lengths, alphabet_size, - batch_size, gpu, &size_bytes); - - // round-up so there are enough elems in memory - int num_tmp_elems = (size_bytes + sizeof(real_t) - 1) / sizeof(real_t); - Tensor workspace = - ctx.requested[ctc_loss::kTempSpace].get_space_typed( - Shape1(num_tmp_elems), s); - - compute_ctc_cost(data, costs.dptr_, grad.dptr_, packed_labels.data(), - label_lengths.data(), input_lengths.data(), - workspace.dptr_, ctx.is_train); + std::vector label_lengths(batch_size); + + if (param_.use_label_lengths) { + int kLabelLength = 2+param_.use_data_lengths; + exceed_cudnn_limit = PackLabelByLength(labels, in_data[kLabelLength].get(s), + &packed_labels, &label_lengths); + } else { + exceed_cudnn_limit = LabelTensorToPackedVector(labels, param_.padding_mask.value(), + &packed_labels, &label_lengths); + } + +#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + if (!param_.use_data_lengths && !exceed_cudnn_limit) { + cudnn_forward(ctx, s, data, costs, grad, + &data_lengths, &label_lengths, &packed_labels, + max_seq_len, batch_size, alphabet_size); + } else { + baidu_forward(ctx, s, data, costs, grad, + &data_lengths, &label_lengths, &packed_labels, + batch_size, alphabet_size); + } +#else + baidu_forward(ctx, s, data, costs, grad, + &data_lengths, &label_lengths, &packed_labels, + batch_size, alphabet_size); +#endif // __CUDACC__ && CUDNN } virtual void Backward(const OpContext &ctx, @@ -221,12 +316,143 @@ class CTCLossOp : public Operator { Tensor data_grad_computed = out_data[ctc_loss::kGrad].get(s); - Assign(data_grad, req[ctc_loss::kData], - broadcast<1>(output_grad, data_grad.shape_) * data_grad_computed); +#if 
defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + if (!param_.use_data_lengths && !exceed_cudnn_limit) { + cudnn_backward_extra(s, data_grad, output_grad, data_grad_computed); + } else { + baidu_backward_extra(req, data_grad, output_grad, data_grad_computed); + } +#else + baidu_backward_extra(req, data_grad, output_grad, data_grad_computed); +#endif } private: CTCLossParam param_; + bool exceed_cudnn_limit; + +#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + cudnnDataType_t dtype_; + cudnnCTCLossDescriptor_t ctc_desc_; + cudnnTensorDescriptor_t prob_desc_, grad_desc_; + + inline virtual void cudnn_forward(const OpContext &ctx, + mshadow::Stream* s, + mshadow::Tensor data, + mshadow::Tensor costs, + mshadow::Tensor grad, + std::vector* data_lengths, + std::vector* label_lengths, + std::vector* packed_labels, + int max_seq_len, + int batch_size, + int alphabet_size) { + using namespace mshadow; + + // call cudnn to calculate ctc loss + dtype_ = CUDNN_DATA_FLOAT; + int dims[3], strides[3]; + size_t workspace_bytes; + int workspace_size; + dims[0] = max_seq_len; + dims[1] = batch_size; + dims[2] = alphabet_size; + strides[0] = batch_size*alphabet_size; + strides[1] = alphabet_size; + strides[2] = 1; + cudnnCTCLossAlgo_t ctc_algo = CUDNN_CTC_LOSS_ALGO_DETERMINISTIC; + CUDNN_CALL(cudnnSetTensorNdDescriptor(prob_desc_, + dtype_, + 3, + dims, + strides)); + CUDNN_CALL(cudnnSetTensorNdDescriptor(grad_desc_, + dtype_, + 3, + dims, + strides)); + CUDNN_CALL(cudnnGetCTCLossWorkspaceSize(s->dnn_handle_, + prob_desc_, + grad_desc_, + packed_labels->data(), + label_lengths->data(), + data_lengths->data(), + ctc_algo, + ctc_desc_, + &workspace_bytes)); + workspace_size = workspace_bytes/sizeof(real_t); + + Tensor temp_space = + ctx.requested[ctc_loss::kTempSpace].get_space_typed( + mshadow::Shape1(workspace_size+data.shape_.FlatTo1D()[0]), s); + + Tensor work_space(temp_space.dptr_, + mshadow::Shape1(workspace_size), s); + Tensor 
prob(temp_space.dptr_+workspace_size, + data.shape_, s); + + // since the input is activation before softmax and cudnn ctc takes softmax + // apply softmax to inputs first. + mxnet_op::Softmax(s, data.dptr_, prob.dptr_, data.shape_, 2); + + CUDNN_CALL(cudnnCTCLoss(s->dnn_handle_, + prob_desc_, + prob.dptr_, + packed_labels->data(), + label_lengths->data(), + data_lengths->data(), + costs.dptr_, + grad_desc_, + grad.dptr_, + ctc_algo, + ctc_desc_, + work_space.dptr_, + workspace_bytes)); + } + inline virtual void cudnn_backward_extra(mshadow::Stream* s, + mshadow::Tensor data_grad, + mshadow::Tensor output_grad, + mshadow::Tensor data_grad_computed) { + mxnet_op::SoftmaxGrad(s, + output_grad.dptr_, data_grad_computed.dptr_, data_grad.dptr_, data_grad.shape_, 2); + } +#endif // __CUDACC__ && CUDNN + + inline virtual void baidu_forward(const OpContext &ctx, + mshadow::Stream* s, + mshadow::Tensor data, + mshadow::Tensor costs, + mshadow::Tensor grad, + std::vector* data_lengths, + std::vector* label_lengths, + std::vector* packed_labels, + int batch_size, + int alphabet_size) { + using namespace mshadow; + // allocate temporary workspace + size_t size_bytes; + bool gpu = data.kDevCPU ? 
false : true; + get_workspace_size(label_lengths, data_lengths, alphabet_size, + batch_size, gpu, &size_bytes); + + // round-up so there are enough elems in memory + int num_tmp_elems = (size_bytes + sizeof(real_t) - 1) / sizeof(real_t); + Tensor workspace = + ctx.requested[ctc_loss::kTempSpace].get_space_typed( + Shape1(num_tmp_elems), s); + + compute_ctc_cost(data, costs.dptr_, grad.dptr_, packed_labels->data(), + label_lengths->data(), data_lengths->data(), + workspace.dptr_, ctx.is_train); + } + + inline virtual void baidu_backward_extra(const std::vector &req, + mshadow::Tensor data_grad, + mshadow::Tensor output_grad, + mshadow::Tensor data_grad_computed) { + Assign(data_grad, req[ctc_loss::kData], + mshadow::expr::broadcast<1>(output_grad, data_grad.shape_) * data_grad_computed); + } }; // class CTCLossOp template @@ -240,15 +466,22 @@ class CTCLossProp : public OperatorProperty { int NumOutputs() const override { return 2; } std::vector ListArguments() const override { - return {"data", "label"}; + if (param_.use_data_lengths && param_.use_label_lengths) { + return {"data", "label", "data_lengths", "label_lengths"}; + } else if (param_.use_data_lengths) { + return {"data", "label", "data_lengths"}; + } else if (param_.use_label_lengths) { + return {"data", "label", "label_lengths"}; + } else { + return {"data", "label"}; + } } std::vector ListOutputs() const override { return {"output", "grad"}; } - void Init( - const std::vector> &kwargs) override { + void Init(const std::vector> &kwargs) override { param_.Init(kwargs); } @@ -259,7 +492,9 @@ class CTCLossProp : public OperatorProperty { bool InferShape(std::vector *in_shape, std::vector *out_shape, std::vector *aux_shape) const override { using namespace mshadow; - CHECK_EQ(in_shape->size(), 2U) << "Expect two inputs to the symbol."; + index_t expected_inputs = 2+param_.use_data_lengths+param_.use_label_lengths; + CHECK_EQ(in_shape->size(), expected_inputs) + << "Expect " << expected_inputs << " inputs to 
the symbol."; const TShape &dshape = (*in_shape)[ctc_loss::kData]; const TShape &lshape = (*in_shape)[ctc_loss::kLabel]; @@ -267,10 +502,24 @@ class CTCLossProp : public OperatorProperty { CHECK_EQ(lshape.ndim(), 2U) << "The labels array must be of rank 2."; CHECK_EQ(dshape[1], lshape[0]) << "The batch size for the labels and data arrays must be the same."; + if (param_.use_data_lengths) { + int kInputLength = 2; + const TShape &dlshape = (*in_shape)[kInputLength]; + CHECK_EQ(dlshape.ndim(), 1U) << "Data length array must be a vector."; + CHECK_EQ(dlshape[0], dshape[1]) + << "The batch size for the data and data lengths must be the same."; + } + if (param_.use_label_lengths) { + int kLabelLength = 2+param_.use_data_lengths; + const TShape &llshape = (*in_shape)[kLabelLength]; + CHECK_EQ(llshape.ndim(), 1U) << "Label length array must be a vector."; + CHECK_EQ(llshape[0], lshape[0]) + << "The batch size for the labels and label lengths must be the same."; + } CHECK_GE(dshape[0], lshape[1]) << "The max number of labels cannot exceed " "the maximum sequence length of the " - "input."; + "data."; TShape oshape(1); oshape[0] = dshape[1]; // batch size diff --git a/src/operator/contrib/ctc_loss.cc b/src/operator/contrib/ctc_loss.cc index 3727cee10b1c..d544a1fdec04 100644 --- a/src/operator/contrib/ctc_loss.cc +++ b/src/operator/contrib/ctc_loss.cc @@ -31,7 +31,7 @@ namespace mshadow { template ctcStatus_t compute_ctc_cost(const Tensor activations, DType *costs, DType *grads, int *labels, - int *label_lengths, int *input_lengths, + int *label_lengths, int *data_lengths, void *workspace, int train) { int minibatch = static_cast(activations.size(1)); int alphabet_size = static_cast(activations.size(2)); @@ -39,10 +39,10 @@ ctcStatus_t compute_ctc_cost(const Tensor activations, mxnet_warpctc::CpuCTC ctc(alphabet_size, minibatch, workspace, blank_label); if (train) return ctc.cost_and_grad(activations.dptr_, grads, costs, labels, - label_lengths, input_lengths); + 
label_lengths, data_lengths); else return ctc.score_forward(activations.dptr_, costs, labels, label_lengths, - input_lengths); + data_lengths); } } // namespace mshadow @@ -100,6 +100,12 @@ information. .add_argument("data", "NDArray-or-Symbol", "Input data to the ctc_loss op.") .add_argument("label", "NDArray-or-Symbol", "Ground-truth labels for the loss.") + .add_argument("data_lengths", "NDArray-or-Symbol", + "Lengths of data for each of the samples. Only required " + "when use_data_lengths is true.") + .add_argument("label_lengths", "NDArray-or-Symbol", + "Lengths of labels for each of the samples. Only required " + "when use_label_lengths is true.") .add_arguments(CTCLossParam::__FIELDS__()); NNVM_REGISTER_OP(_contrib_CTCLoss).add_alias("_contrib_ctc_loss"); diff --git a/src/operator/sequence_op_common.h b/src/operator/sequence_op_common.h index 9e5843161087..724e0e0da121 100644 --- a/src/operator/sequence_op_common.h +++ b/src/operator/sequence_op_common.h @@ -32,9 +32,10 @@ namespace mxnet { namespace op { -template -void IndexTensorToVector(mshadow::Tensor data, - std::vector *index_vec) { +template +typename std::enable_if::value>::type +IndexTensorToVector(mshadow::Tensor data, + std::vector *index_vec) { int max_seq_len = data.shape_.Size(); #if MXNET_USE_CUDA DType *temp_index = @@ -44,18 +45,19 @@ void IndexTensorToVector(mshadow::Tensor data, cudaMemcpyDeviceToHost, data.stream_->stream_); CHECK_EQ(cuda_status, cudaSuccess) << "cuda memcpy label error"; for (int i = 0; i < max_seq_len; ++i) { - (*index_vec)[i] = static_cast(temp_index[i]); + (*index_vec)[i] = static_cast(temp_index[i]); } free(temp_index); #endif } -template -void IndexTensorToVector(mshadow::Tensor data, - std::vector *index_vec) { +template +typename std::enable_if::value>::type +IndexTensorToVector(mshadow::Tensor data, + std::vector *index_vec) { int max_seq_len = data.shape_.Size(); DType *index_array = static_cast(data.dptr_); for (int i = 0; i < max_seq_len; ++i) - 
(*index_vec)[i] = static_cast(index_array[i]); + (*index_vec)[i] = static_cast(index_array[i]); } } // namespace op diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 35a20f935573..11d146cae840 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -29,6 +29,7 @@ from test_optimizer import * from test_random import * from test_gluon import * +from test_loss import * #from test_rnn import * from test_gluon_rnn import * from test_sparse_operator import test_cast_storage_ex, test_sparse_dot diff --git a/tests/python/unittest/test_loss.py b/tests/python/unittest/test_loss.py index 714ea7562fdb..b864215ca1d1 100644 --- a/tests/python/unittest/test_loss.py +++ b/tests/python/unittest/test_loss.py @@ -165,6 +165,36 @@ def test_l1_loss(): assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.1 +def test_ctc_loss(): + loss = gluon.loss.CTCLoss(padding_mask=0) + l = loss(mx.nd.ones((2,20,4)), mx.nd.array([[2,1,0,0],[3,2,2,0]])) + mx.test_utils.assert_almost_equal(l.asnumpy(), np.array([18.82820702, 16.50581741])) + + loss = gluon.loss.CTCLoss(layout='TNC', padding_mask=0) + l = loss(mx.nd.ones((20,2,4)), mx.nd.array([[2,1,0,0],[3,2,2,0]])) + mx.test_utils.assert_almost_equal(l.asnumpy(), np.array([18.82820702, 16.50581741])) + + loss = gluon.loss.CTCLoss(layout='TNC', label_layout='TN', padding_mask=0) + l = loss(mx.nd.ones((20,2,4)), mx.nd.array([[2,1,0,0],[3,2,2,0]]).T) + mx.test_utils.assert_almost_equal(l.asnumpy(), np.array([18.82820702, 16.50581741])) + + loss = gluon.loss.CTCLoss(padding_mask=-1) + l = loss(mx.nd.ones((2,20,4)), mx.nd.array([[2,1,-1,-1],[3,2,2,-1]])) + mx.test_utils.assert_almost_equal(l.asnumpy(), np.array([18.82820702, 16.50581741])) + + loss = gluon.loss.CTCLoss() + l = loss(mx.nd.ones((2,20,4)), mx.nd.array([[2,1,2,2],[3,2,2,2]]), None, mx.nd.array([2,3])) + mx.test_utils.assert_almost_equal(l.asnumpy(), np.array([18.82820702, 16.50581741])) + + 
loss = gluon.loss.CTCLoss() + l = loss(mx.nd.ones((2,25,4)), mx.nd.array([[2,1,-1,-1],[3,2,2,-1]]), mx.nd.array([20,20])) + mx.test_utils.assert_almost_equal(l.asnumpy(), np.array([18.82820702, 16.50581741])) + + loss = gluon.loss.CTCLoss() + l = loss(mx.nd.ones((2,25,4)), mx.nd.array([[2,1,3,3],[3,2,2,3]]), mx.nd.array([20,20]), mx.nd.array([2,3])) + mx.test_utils.assert_almost_equal(l.asnumpy(), np.array([18.82820702, 16.50581741])) + + def test_sample_weight_loss(): mx.random.seed(1234) np.random.seed(1234) From 0a3ee080d9706702c6c279a7051aef1fa806fd34 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Fri, 25 Aug 2017 03:03:24 +0800 Subject: [PATCH 028/448] Fix symbol load json (#6420) --- cpp-package/include/mxnet-cpp/symbol.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp-package/include/mxnet-cpp/symbol.hpp b/cpp-package/include/mxnet-cpp/symbol.hpp index ee1a11e26a40..11590fad6041 100644 --- a/cpp-package/include/mxnet-cpp/symbol.hpp +++ b/cpp-package/include/mxnet-cpp/symbol.hpp @@ -103,6 +103,7 @@ inline Symbol Symbol::Load(const std::string &file_name) { return Symbol(handle); } inline Symbol Symbol::LoadJSON(const std::string &json_str) { + op_map(); SymbolHandle handle; CHECK_EQ(MXSymbolCreateFromJSON(json_str.c_str(), &(handle)), 0); return Symbol(handle); From 7f90a39c6d3e42cfb91946120a4f90797b3a06a0 Mon Sep 17 00:00:00 2001 From: gurumurthys Date: Thu, 24 Aug 2017 12:11:18 -0700 Subject: [PATCH 029/448] Added gen_data.py and modified README.md for bi-lstm-sort example (#6549) --- example/bi-lstm-sort/README.md | 6 +++++- example/bi-lstm-sort/gen_data.py | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 example/bi-lstm-sort/gen_data.py diff --git a/example/bi-lstm-sort/README.md b/example/bi-lstm-sort/README.md index b56b671c428e..a590a18bfbc0 100644 --- a/example/bi-lstm-sort/README.md +++ b/example/bi-lstm-sort/README.md @@ -2,9 +2,13 @@ This is an example of using bidirection lstm to sort an array. 
Firstly, generate data by: - cd data python gen_data.py +Move generated txt files to data directory + + mkdir data + mv *.txt data + Then, train the model by: python lstm_sort.py diff --git a/example/bi-lstm-sort/gen_data.py b/example/bi-lstm-sort/gen_data.py new file mode 100644 index 000000000000..55ce1cfba2fb --- /dev/null +++ b/example/bi-lstm-sort/gen_data.py @@ -0,0 +1,20 @@ +import random + +vocab = [str(x) for x in range(100, 1000)] +sw_train = open("sort.train.txt", "w") +sw_test = open("sort.test.txt", "w") +sw_valid = open("sort.valid.txt", "w") + +for i in range(1000000): + seq = " ".join([vocab[random.randint(0, len(vocab) - 1)] for j in range(5)]) + k = i % 50 + if k == 0: + sw_test.write(seq + "\n") + elif k == 1: + sw_valid.write(seq + "\n") + else: + sw_train.write(seq + "\n") + +sw_train.close() +sw_test.close() +sw_valid.close() From c584b516b15eedb036a5cb598674031ffb751fb4 Mon Sep 17 00:00:00 2001 From: Zehao Shi Date: Fri, 25 Aug 2017 03:14:26 +0800 Subject: [PATCH 030/448] Fix RCNN multi-gpu bucketing warning which may cause OOM error. (#6965) * Fix a spelling mistake. * FIX pad example * fix smooth l1 comment * Fix rcnn multi-gpu bucketing warning --- example/rcnn/rcnn/core/loader.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/example/rcnn/rcnn/core/loader.py b/example/rcnn/rcnn/core/loader.py index 826ee20f080c..fdd6e5c386f1 100644 --- a/example/rcnn/rcnn/core/loader.py +++ b/example/rcnn/rcnn/core/loader.py @@ -165,11 +165,16 @@ def reset(self): vert = np.logical_not(horz) horz_inds = np.where(horz)[0] vert_inds = np.where(vert)[0] + # Avoid putting different aspect ratio image into the same bucket, + # which may cause bucketing warning. 
+ pad_horz = self.batch_size - len(horz_inds) % self.batch_size + pad_vert = self.batch_size - len(vert_inds) % self.batch_size + horz_inds = np.hstack([horz_inds, horz_inds[:pad_horz]]) + vert_inds = np.hstack([vert_inds, vert_inds[:pad_vert]]) inds = np.hstack((np.random.permutation(horz_inds), np.random.permutation(vert_inds))) - extra = inds.shape[0] % self.batch_size - inds_ = np.reshape(inds[:-extra], (-1, self.batch_size)) - row_perm = np.random.permutation(np.arange(inds_.shape[0])) - inds[:-extra] = np.reshape(inds_[row_perm, :], (-1,)) + inds = np.reshape(inds[:], (-1, self.batch_size)) + row_perm = np.random.permutation(np.arange(inds.shape[0])) + inds = np.reshape(inds[row_perm, :], (-1,)) self.index = inds else: np.random.shuffle(self.index) From 3730f549fa9e9cb30b99e98e1f25a5372f7c9421 Mon Sep 17 00:00:00 2001 From: Adam Russell Date: Thu, 24 Aug 2017 16:21:09 -0400 Subject: [PATCH 031/448] Changed next() method to use the seq_size attribute and not the global variable $seq_size. 
(#7521) --- perl-package/AI-MXNet/examples/char_lstm.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perl-package/AI-MXNet/examples/char_lstm.pl b/perl-package/AI-MXNet/examples/char_lstm.pl index 54a9e3672f63..9a80ddadf618 100755 --- a/perl-package/AI-MXNet/examples/char_lstm.pl +++ b/perl-package/AI-MXNet/examples/char_lstm.pl @@ -133,7 +133,7 @@ sub BUILD [$offset + 1 , $offset + $self->batch_size*$self->seq_size] )->reshape([$self->batch_size, $self->seq_size]); $self->seq_counter($self->seq_counter + 1); - if($self->seq_counter == $seq_size - 1) + if($self->seq_counter == $self->seq_size - 1) { $self->counter($self->counter + 1); $self->seq_counter(0); From 125a12631c6c051a8b73cb6d2fdaad17ea71b31d Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Thu, 24 Aug 2017 13:44:19 -0700 Subject: [PATCH 032/448] Relaxing condition in slice (#7487) * Relaxing condition in slice * Update ndarray.cc --- src/ndarray/ndarray.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 0d2968626d79..139d97670bec 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -91,7 +91,8 @@ NDArray NDArray::Slice(index_t begin, index_t end) const { using namespace autograd; using namespace mshadow; CHECK(!is_none()) << "NDArray is not initialized"; - CHECK_LT(begin, end) << "Invalid slicing range [" << begin << ", " << end << ")"; + CHECK_LE(begin, end) + << "Invalid slicing range [" << begin << ", " << end << ")"; CHECK_GE(shape_[0], end) << "Slice end index out of range"; CHECK_EQ(storage_type(), kDefaultStorage); NDArray ret = *this; From d956d1962ec69df12fa31e7800ba1a89c466f36c Mon Sep 17 00:00:00 2001 From: mrkumar83 Date: Thu, 24 Aug 2017 13:51:24 -0700 Subject: [PATCH 033/448] Fixing loss function code in tutorial (#7583) * Fixing loss function code in tutorial * Updating pull request with feedback --- docs/tutorials/gluon/gluon.md | 8 +++++--- 1 file changed, 5 insertions(+), 
3 deletions(-) diff --git a/docs/tutorials/gluon/gluon.md b/docs/tutorials/gluon/gluon.md index ac1aa3f60f5e..a1688ea121dd 100644 --- a/docs/tutorials/gluon/gluon.md +++ b/docs/tutorials/gluon/gluon.md @@ -102,7 +102,8 @@ To compute loss and backprop for one iteration, we do: label = mx.nd.arange(10) # dummy label with autograd.record(): output = net(data) - loss = gluon.loss.softmax_cross_entropy_loss(output, label) + L = gluon.loss.SoftmaxCrossEntropyLoss() + loss = L(output, label) loss.backward() print('loss:', loss) print('grad:', net.fc1.weight.grad()) @@ -127,9 +128,10 @@ this is a commonly used functionality, gluon provide a `Trainer` class for it: ```python trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01}) -with record(): +with autograd.record(): output = net(data) - loss = gluon.loss.softmax_cross_entropy_loss(output, label) + L = gluon.loss.SoftmaxCrossEntropyLoss() + loss = L(output, label) loss.backward() # do the update. Trainer needs to know the batch size of data to normalize From 7f65a3438c95bea91f99a4cba9f645b177029271 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Thu, 24 Aug 2017 15:00:57 -0700 Subject: [PATCH 034/448] Add MXNet MKL pip install (#7598) --- docs/get_started/install.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/get_started/install.md b/docs/get_started/install.md index 2ab771d4cfef..65126a53254e 100644 --- a/docs/get_started/install.md +++ b/docs/get_started/install.md @@ -150,6 +150,11 @@ pip install graphviz **Step 4** Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation). +**Experimental Choice** If you would like to install mxnet with Intel MKL, try the experimental pip package with MKL: +```bash +$ pip install mxnet-mkl +``` +
@@ -313,6 +318,11 @@ pip install graphviz **Step 4** Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation). +**Experimental Choice** If you would like to install MXNet with Intel MKL, try the experimental pip package with MKL: +```bash +$ pip install mxnet-cu80mkl +``` +
From b34580ea8f215cf57137a625143506545cbb587d Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 24 Aug 2017 15:08:40 -0700 Subject: [PATCH 035/448] add license to new file (#7599) --- example/bi-lstm-sort/gen_data.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/example/bi-lstm-sort/gen_data.py b/example/bi-lstm-sort/gen_data.py index 55ce1cfba2fb..55af1b45554a 100644 --- a/example/bi-lstm-sort/gen_data.py +++ b/example/bi-lstm-sort/gen_data.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ import random vocab = [str(x) for x in range(100, 1000)] From b7106369c99b7d0b7413617c9331d7be5acd91f9 Mon Sep 17 00:00:00 2001 From: Chris Olivier Date: Thu, 24 Aug 2017 15:43:22 -0700 Subject: [PATCH 036/448] nightly build test mnist training and optimizer (#7559) (#7562) * nightly build stochastically choose optimizer (#7559) * Only call MKL script once * Fix 'momentum' and 'multi_precision' optimizer args * fix cmake build for active kvstore * stochastic choice of optimizer for mnist training * Run all three optimizers * Add just lenet test * Trigger CI --- CMakeLists.txt | 39 +++++---- tests/nightly/test_all.sh | 22 ++++- tests/nightly/test_image_classification.sh | 93 ++++++++++++++++++++++ 3 files changed, 135 insertions(+), 19 deletions(-) create mode 100755 tests/nightly/test_image_classification.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index dc9ca5f7bb0c..5e32f6baefe3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -354,7 +354,7 @@ if(USE_CUDA) FIND_LIBRARY(CUDA_cufft_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") list(APPEND mxnet_LINKER_LIBS "${CUDA_cufft_LIBRARY}/../cufft.lib") # For fft operator FIND_LIBRARY(CUDA_cusolver_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") - list(APPEND mxnet_LINKER_LIBS "${CUDA_cusolver_LIBRARY}/../cusolver.lib") # For cusolver + list(APPEND mxnet_LINKER_LIBS "${CUDA_cusolver_LIBRARY}/../cusolver.lib") # For cusolver else(MSVC) list(APPEND mxnet_LINKER_LIBS nvrtc cuda cufft cusolver) link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") @@ -419,6 +419,29 @@ else() add_library(mxnet SHARED ${SOURCE}) endif() endif() + +if(USE_DIST_KVSTORE) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ps-lite/CMakeLists.txt) + add_subdirectory("ps-lite") + list(APPEND pslite_LINKER_LIBS pslite protobuf) + target_link_libraries(mxnet debug ${pslite_LINKER_LIBS_DEBUG}) + target_link_libraries(mxnet optimized ${pslite_LINKER_LIBS_RELEASE}) + 
if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND mxnet_LINKER_LIBS ${pslite_LINKER_LIBS_DEBUG}) + else() + list(APPEND mxnet_LINKER_LIBS ${pslite_LINKER_LIBS_RELEASE}) + endif() + target_link_libraries(mxnet debug ${pslite_LINKER_LIBS_DEBUG}) + target_link_libraries(mxnet optimized ${pslite_LINKER_LIBS_RELEASE}) + + else() + set(pslite_LINKER_LIBS protobuf zmq-static) + endif() + add_definitions(-DMXNET_USE_DIST_KVSTORE) + include_directories(SYSTEM ${pslite_INCLUDE_DIR}) + list(APPEND mxnet_LINKER_LIBS ${pslite_LINKER_LIBS}) +endif() + target_link_libraries(mxnet ${mxnet_LINKER_LIBS}) if(USE_PLUGINS_WARPCTC) @@ -433,20 +456,6 @@ if(MSVC AND USE_MXNET_LIB_NAMING) endif() -if(USE_DIST_KVSTORE) - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ps-lite/CMakeLists.txt) - add_subdirectory("ps-lite") - list(APPEND pslite_LINKER_LIBS pslite) - target_link_libraries(mxnet debug ${pslite_LINKER_LIBS_DEBUG}) - target_link_libraries(mxnet optimized ${pslite_LINKER_LIBS_RELEASE}) - else() - set(pslite_LINKER_LIBS protobuf zmq-static ) - endif() - add_definitions(-DMXNET_USE_DIST_KVSTORE) - target_link_libraries(mxnet ${pslite_LINKER_LIBS}) - include_directories(SYSTEM ${pslite_INCLUDE_DIR}) -endif() - if(USE_PROFILER) add_definitions(-DMXNET_USE_PROFILER) endif() diff --git a/tests/nightly/test_all.sh b/tests/nightly/test_all.sh index 32913c9f5f5b..04d895fecf21 100755 --- a/tests/nightly/test_all.sh +++ b/tests/nightly/test_all.sh @@ -72,10 +72,24 @@ check_val() { example_dir=../../example/image-classification # python: lenet + mnist test_lenet() { - python $example_dir/train_mnist.py \ - --data-dir `pwd`/data/mnist/ --network lenet --gpus $gpus --num-epochs 10 \ - 2>&1 | tee log - check_val 0.99 + optimizers="adam sgd adagrad" + for optimizer in ${optimizers}; do + echo "OPTIMIZER: $optimizer" + if [ "$optimizer" == "adam" ]; then + learning_rate=0.0005 + desired_accuracy=0.98 + else + learning_rate=0.01 + desired_accuracy=0.99 + fi + python $example_dir/train_mnist.py --lr 
$learning_rate \ + --network lenet --optimizer $optimizer --gpus $gpus \ + --num-epochs 10 2>&1 | tee log + if [ $? -ne 0 ]; then + return $? + fi + check_val $desired_accuracy + done } juLog -name=Python.Lenet.Mnist -error=Fail test_lenet diff --git a/tests/nightly/test_image_classification.sh b/tests/nightly/test_image_classification.sh new file mode 100755 index 000000000000..93e403a2affc --- /dev/null +++ b/tests/nightly/test_image_classification.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +# setup +export LD_LIBRARY_PATH=`pwd`/`dirname $0`/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH +export PYTHONPATH=`pwd`/`dirname $0`/python +cd `pwd`/`dirname $0` +. sh2ju.sh + +## clean last build log +juLogClean + +if [ -f $(which nvidia-smi) ]; then + if [ $# -eq 1 ]; then + num_gpus=$1 + else + num_gpus=$(nvidia-smi -L | grep "GPU" | wc -l) + fi + gpus=`seq 0 $((num_gpus-1)) | paste -sd ","` + device_arg="--gpus $gpus" +else + device_arg="" +fi + +# build +build() { + make -C ../.. clean + make -C ../.. -j8 + return $? +} + +cp ../../make/config.mk ../.. 
+cat >>../../config.mk < $expected) print \"$pass\"; else print \"$fail\"}" + rm -f log +} + +example_dir=../../example/image-classification +# python: lenet + mnist +test_lenet() { + optimizers="adam sgd adagrad" + for optimizer in ${optimizers}; do + echo "OPTIMIZER: $optimizer" + if [ "$optimizer" == "adam" ]; then + learning_rate=0.0005 + desired_accuracy=0.98 + else + learning_rate=0.01 + desired_accuracy=0.99 + fi + python $example_dir/train_mnist.py --lr $learning_rate \ + --network lenet --optimizer $optimizer --gpus $gpus \ + --num-epochs 10 2>&1 | tee log + if [ $? -ne 0 ]; then + return $? + fi + check_val $desired_accuracy + done +} +juLog -name=Python.Lenet.Mnist -error=Fail test_lenet + +exit $errors From 4b94360e6e970ea71252f1e79b841e7bc4105de5 Mon Sep 17 00:00:00 2001 From: Saswata Date: Thu, 24 Aug 2017 19:22:25 -0400 Subject: [PATCH 037/448] make MXDataIter work without indices (#7456) indices are optional, custom cpp iterators providing data batches without indices should work while using MXDataIter. 
--- python/mxnet/io.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/mxnet/io.py b/python/mxnet/io.py index 4e69a8a801cb..314a2b28dfa8 100644 --- a/python/mxnet/io.py +++ b/python/mxnet/io.py @@ -815,10 +815,13 @@ def getindex(self): check_call(_LIB.MXDataIterGetIndex(self.handle, ctypes.byref(index_data), ctypes.byref(index_size))) - address = ctypes.addressof(index_data.contents) - dbuffer = (ctypes.c_uint64* index_size.value).from_address(address) - np_index = np.frombuffer(dbuffer, dtype=np.uint64) - return np_index.copy() + if index_size.value: + address = ctypes.addressof(index_data.contents) + dbuffer = (ctypes.c_uint64* index_size.value).from_address(address) + np_index = np.frombuffer(dbuffer, dtype=np.uint64) + return np_index.copy() + else: + return None def getpad(self): pad = ctypes.c_int(0) From 97a15c2ac602199df45879e1a4e4daac95a1c445 Mon Sep 17 00:00:00 2001 From: Kenji Doi Date: Fri, 25 Aug 2017 13:10:42 +0900 Subject: [PATCH 038/448] NAG also has 'momentum' optimizer args (#7602) --- example/image-classification/common/fit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index dfec2a886b80..aeead0f82a3b 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -171,7 +171,7 @@ def fit(args, network, data_loader, **kwargs): optimizer_params['multi_precision'] = True # Only a limited number of optimizers have 'momentum' property - has_momentum = {'sgd', 'dcasgd'} + has_momentum = {'sgd', 'dcasgd', 'nag'} if args.optimizer in has_momentum: optimizer_params['momentum'] = args.mom From bc468b07421bb6bad2d3039b7a5b4f5d2aa256cc Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 24 Aug 2017 21:14:01 -0700 Subject: [PATCH 039/448] Convert dot to linalg gemm (#7603) * Expands linalg_gemm use. Legacy mshadow::dot use only if no cblas. * Fix cpplint. 
--- Jenkinsfile | 2 +- .../contrib/deformable_convolution-inl.h | 19 ++++--- src/operator/convolution-inl.h | 19 ++++--- src/operator/convolution_v1-inl.h | 17 ++++-- src/operator/deconvolution-inl.h | 22 ++++++-- src/operator/fully_connected-inl.h | 23 ++------ src/operator/grid_generator-inl.h | 9 ++- src/operator/linalg_impl.h | 56 +++++++++++++++++-- src/operator/spatial_transformer-inl.h | 9 ++- 9 files changed, 124 insertions(+), 52 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index bf237a589c99..2d4cc017c865 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -155,7 +155,7 @@ try { ws('workspace/amalgamation') { init_git() make('cpu', '-C amalgamation/ clean') - make('cpu', '-C amalgamation/ USE_BLAS=openblas') + make('cpu', '-C amalgamation/ USE_BLAS=openblas MIN=1') } } }, diff --git a/src/operator/contrib/deformable_convolution-inl.h b/src/operator/contrib/deformable_convolution-inl.h index a8dc6b8f09ed..18c1fa367e67 100644 --- a/src/operator/contrib/deformable_convolution-inl.h +++ b/src/operator/contrib/deformable_convolution-inl.h @@ -44,6 +44,7 @@ #include "../operator_common.h" #include "../nn/im2col.h" #include "./nn/deformable_im2col.h" +#include "../linalg.h" namespace mxnet { @@ -152,7 +153,9 @@ class DeformableConvolutionOp : public Operator { param_.num_deformable_group, col_buffer.dptr()); Tensor output_3d = output_4d[n]; for (index_t g = 0; g < group_; ++g) { - ASSIGN_DISPATCH(output_3d[g], req[conv::kOut], dot(weight_3d[g], col_buffer_3d[g])); + // Legacy approach shown here for comparison: + // Assign(output_3d[g], req[conv::kOut], dot(weight_3d[g], col_buffer_3d[g])); + linalg_gemm(weight_3d[g], col_buffer_3d[g], output_3d[g], false, false, s, req[conv::kOut]); } } if (bias_term_) { @@ -216,7 +219,9 @@ class DeformableConvolutionOp : public Operator { for (index_t n = 0; n < num_; ++n) { Tensor out_grad_3d = out_grad_4d[n]; for (index_t g = 0; g < group_; ++g) { - col_buffer_3d[g] = dot(weight_3d[g].T(), out_grad_3d[g]); + // Legacy 
approach shown here for comparison: + // col_buffer_3d[g] = dot(weight_3d[g].T(), out_grad_3d[g]); + linalg_gemm(weight_3d[g], out_grad_3d[g], col_buffer_3d[g], true, false, s); } // gradient w.r.t. input coordinate data @@ -243,12 +248,10 @@ class DeformableConvolutionOp : public Operator { param_.num_deformable_group, col_buffer.dptr()); for (index_t g = 0; g < group_; ++g) { - if (0 == n) { - ASSIGN_DISPATCH(dweight_3d[g], req[conv::kWeight], - dot(out_grad_3d[g], col_buffer_3d[g].T())); - } else { - dweight_3d[g] += dot(out_grad_3d[g], col_buffer_3d[g].T()); - } + auto request = (n == 0) ? req[conv::kWeight] : kAddTo; + // Legacy approach shown here for comparison: + // Assign(dweight_3d[g], request, dot(out_grad_3d[g], col_buffer_3d[g].T())); + linalg_gemm(out_grad_3d[g], col_buffer_3d[g], dweight_3d[g], false, true, s, request); } } diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h index 0a2522cccb65..0edaee1dae32 100644 --- a/src/operator/convolution-inl.h +++ b/src/operator/convolution-inl.h @@ -40,6 +40,7 @@ #include #include "./operator_common.h" #include "./nn/im2col.h" +#include "./linalg.h" namespace mxnet { @@ -160,7 +161,9 @@ class ConvolutionOp : public Operator { col_buffer.dptr()); Tensor output_3d = output_4d[n]; for (index_t g = 0; g < group_; ++g) { - ASSIGN_DISPATCH(output_3d[g], req[conv::kOut], dot(weight_3d[g], col_buffer_3d[g])); + // Legacy approach shown here for comparison: + // Assign(output_3d[g], req[conv::kOut], dot(weight_3d[g], col_buffer_3d[g])); + linalg_gemm(weight_3d[g], col_buffer_3d[g], output_3d[g], false, false, s, req[conv::kOut]); } } if (bias_term_) { @@ -219,7 +222,9 @@ class ConvolutionOp : public Operator { Tensor out_grad_3d = out_grad_4d[n]; // gradient w.r.t. 
input data for (index_t g = 0; g < group_; ++g) { - col_buffer_3d[g] = dot(weight_3d[g].T(), out_grad_3d[g]); + // Legacy approach shown here for comparison: + // col_buffer_3d[g] = dot(weight_3d[g].T(), out_grad_3d[g]); + linalg_gemm(weight_3d[g], out_grad_3d[g], col_buffer_3d[g], true, false, s); } col2im(s, col_buffer.dptr(), in_grad[conv::kData].shape_, col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate, @@ -230,12 +235,10 @@ class ConvolutionOp : public Operator { col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate, col_buffer.dptr()); for (index_t g = 0; g < group_; ++g) { - if (0 == n) { - ASSIGN_DISPATCH(dweight_3d[g], req[conv::kWeight], - dot(out_grad_3d[g], col_buffer_3d[g].T())); - } else { - dweight_3d[g] += dot(out_grad_3d[g], col_buffer_3d[g].T()); - } + auto request = (n == 0) ? req[conv::kWeight] : kAddTo; + // Legacy approach shown here for comparison: + // Assign(dweight_3d[g], request, dot(out_grad_3d[g], col_buffer_3d[g].T())); + linalg_gemm(out_grad_3d[g], col_buffer_3d[g], dweight_3d[g], false, true, s, request); } } diff --git a/src/operator/convolution_v1-inl.h b/src/operator/convolution_v1-inl.h index f39d8e0804bc..0ac940c24b19 100644 --- a/src/operator/convolution_v1-inl.h +++ b/src/operator/convolution_v1-inl.h @@ -37,6 +37,7 @@ #include #include #include "./operator_common.h" +#include "./linalg.h" namespace mxnet { namespace op { @@ -180,7 +181,9 @@ class ConvolutionV1Op : public Operator { for (uint32_t gid = 0; gid < param_.num_group; ++gid) { mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); - temp_dst[gid] = dot(wmat[gid], tmpc); + // Legacy approach shown here for comparison: + // temp_dst[gid] = dot(wmat[gid], tmpc); + linalg_gemm(wmat[gid], tmpc, temp_dst[gid], false, false, s); } out.Slice(i, i + step) = swapaxis<1, 0>(reshape(temp_dst, mshadow::Shape4(param_.num_filter, @@ -267,15 +270,21 @@ class ConvolutionV1Op : public Operator { Tensor tmpc = 
temp_col.Slice(gstride * gid, gstride * (gid + 1)); if (i == 0) { Tensor tmp_gwmat = gwmat[gid]; - Assign(tmp_gwmat, req[conv_v1::kWeight], dot(temp_dst[gid], tmpc.T())); + // Legacy approach shown here for comparison: + // Assign(tmp_gwmat, req[conv_v1::kWeight], dot(temp_dst[gid], tmpc.T())); + linalg_gemm(temp_dst[gid], tmpc, tmp_gwmat, false, true, s, req[conv_v1::kWeight]); } else { - gwmat[gid] += dot(temp_dst[gid], tmpc.T()); + // Legacy approach shown here for comparison: + // gwmat[gid] += dot(temp_dst[gid], tmpc.T()); + linalg_gemm(temp_dst[gid], tmpc, gwmat[gid], false, true, s, kAddTo); } } for (uint32_t gid = 0; gid < param_.num_group; ++gid) { Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); - tmpc = dot(wmat[gid].T(), temp_dst[gid]); + // Legacy approach shown here for comparison: + // tmpc = dot(wmat[gid].T(), temp_dst[gid]); + linalg_gemm(wmat[gid], temp_dst[gid], tmpc, true, false, s); } if (param_.pad[0] == 0 && param_.pad[1] == 0) { Assign(gdata.Slice(i, i + step), req[conv_v1::kData], diff --git a/src/operator/deconvolution-inl.h b/src/operator/deconvolution-inl.h index 9db94a8c5986..dd77c150c970 100644 --- a/src/operator/deconvolution-inl.h +++ b/src/operator/deconvolution-inl.h @@ -34,6 +34,7 @@ #include #include #include "./operator_common.h" +#include "./linalg.h" namespace mxnet { @@ -227,7 +228,9 @@ class DeconvolutionOp : public Operator { for (uint32_t gid = 0; gid < param_.num_group; ++gid) { mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); - tmpc = dot(wmat[gid].T(), temp_dst[gid]); + // Legacy approach shown here for comparison: + // tmpc = dot(wmat[gid].T(), temp_dst[gid]); + linalg_gemm(wmat[gid], temp_dst[gid], tmpc, true, false, s); } if (o_pad[0] == 0 && o_pad[1] == 0) { out.Slice(i, i + step) = pack_col2patch(temp_col, @@ -335,16 +338,23 @@ class DeconvolutionOp : public Operator { Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); if (i == 0) { Tensor tmp_gwmat = 
gwmat[gid]; - Assign(tmp_gwmat, req[deconv::kWeight], dot(temp_dst[gid], tmpc.T())); + // Legacy approach shown here for comparison: + // Assign(tmp_gwmat, req[deconv::kWeight], dot(temp_dst[gid], tmpc.T())); + linalg_gemm(temp_dst[gid], tmpc, tmp_gwmat, false, true, s, req[deconv::kWeight]); } else { - gwmat[gid] += dot(temp_dst[gid], tmpc.T()); + // Legacy approach shown here for comparison: + // gwmat[gid] += dot(temp_dst[gid], tmpc.T()); + linalg_gemm(temp_dst[gid], tmpc, gwmat[gid], false, true, s, kAddTo); } } - if (req[deconv::kData] == kWriteTo || req[deconv::kData] == kWriteInplace - || req[deconv::kData] == kAddTo) { + if (req[deconv::kData] == kWriteTo || + req[deconv::kData] == kWriteInplace || + req[deconv::kData] == kAddTo) { for (uint32_t gid = 0; gid < param_.num_group; ++gid) { Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); - temp_dst[gid] = dot(wmat[gid], tmpc); + // Legacy approach shown here for comparison: + // temp_dst[gid] = dot(wmat[gid], tmpc); + linalg_gemm(wmat[gid], tmpc, temp_dst[gid], false, false, s); } Assign(gdata.Slice(i, i + step), req[deconv::kData], diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h index 7120b5672f60..c507e4251f3e 100644 --- a/src/operator/fully_connected-inl.h +++ b/src/operator/fully_connected-inl.h @@ -33,9 +33,7 @@ #include #include "./operator_common.h" #include "./elemwise_op_common.h" -#if (MSHADOW_USE_CBLAS != 0) #include "linalg.h" -#endif namespace mxnet { namespace op { @@ -110,12 +108,9 @@ class FullyConnectedOp : public Operator { Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); } -#if (MSHADOW_USE_CBLAS == 0) - // Legacy approach for amalgamation build w/out cblas - out = dot(data, wmat.T()); -#else + // Legacy approach shown here for comparison: + // out = dot(data, wmat.T()); linalg_gemm(data, wmat, out, false, true, s); -#endif if (!param_.no_bias) { Tensor bias = in_data[fullc::kBias].get(s); out += repmat(bias, data.size(0)); @@ 
-167,24 +162,18 @@ class FullyConnectedOp : public Operator { CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; // gradient of weight Tensor gwmat = in_grad[fullc::kWeight].get(s); -#if (MSHADOW_USE_CBLAS == 0) - // Legacy approach for amalgamation build w/out cblas - Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data)); -#else + // Legacy approach shown here for comparison: + // out = Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data)); linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]); -#endif // gradient of bias if (!param_.no_bias) { Tensor gbias = in_grad[fullc::kBias].get(s); Assign(gbias, req[fullc::kBias], sum_rows(grad)); } // gradient of data -#if (MSHADOW_USE_CBLAS == 0) - // Legacy approach for amalgamation build w/out cblas - Assign(gdata, req[fullc::kData], dot(grad, wmat)); -#else + // Legacy approach shown here for comparison: + // Assign(gdata, req[fullc::kData], dot(grad, wmat)); linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]); -#endif } private: diff --git a/src/operator/grid_generator-inl.h b/src/operator/grid_generator-inl.h index 65fb8ccf2e07..0be6e7806bce 100644 --- a/src/operator/grid_generator-inl.h +++ b/src/operator/grid_generator-inl.h @@ -35,6 +35,7 @@ #include #include "./mshadow_op.h" #include "./operator_common.h" +#include "./linalg.h" namespace mxnet { namespace op { @@ -101,7 +102,9 @@ class GridGeneratorOp : public Operator { grid_dst[1] = scalar(-1.0) + tcast(tcast(grid_dst[1] / scalar(param_.target_shape[1]))) * scalar(2.0/(param_.target_shape[0] - 1)); grid_dst[2] = scalar(1.0); - Assign(out, req[grid::kOut], dot(data, grid_dst)); + // Legacy approach shown here for comparison: + // Assign(out, req[grid::kOut], dot(data, grid_dst)); + linalg_gemm(data, grid_dst, out, false, false, s, req[grid::kOut]); break; } // Warping transformation @@ -150,8 +153,10 @@ class GridGeneratorOp : public Operator { param_.target_shape[0] * param_.target_shape[1]); Tensor 
grad = out_grad[grid::kOut] .get_with_shape(grad_shape, s); + // Legacy approach shown here for comparison: + // Assign(gdata, req[grid::kData], dot(grad, grid_dst.T())); // grad : (batch * 2, H * W) grid_dst.T : (H * W, 3) - Assign(gdata, req[grid::kData] , dot(grad, grid_dst.T())); + linalg_gemm(grad, grid_dst, gdata, false, true, s, req[grid::kData]); break; } case grid::kWarp: { diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h index 1e3b0e66e641..c1e813614c72 100644 --- a/src/operator/linalg_impl.h +++ b/src/operator/linalg_impl.h @@ -56,6 +56,8 @@ inline void check_gemm(const Tensor& A, const Tensor inline \ void linalg_gemm(const Tensor& A, const Tensor& B, \ @@ -69,6 +71,17 @@ void linalg_gemm(const Tensor& A, const Tensor for DType=mshadow::half::half_t. +template<> inline +void linalg_gemm(const Tensor& A, + const Tensor& B, + const Tensor& C, + mshadow::half::half_t alpha, + mshadow::half::half_t beta, + bool tA, bool tB, Stream *s) { + LOG(FATAL) << "FP16 gemm on cpu not implemented!"; +} + #define LINALG_CPU_BATCH_GEMM(DType) \ template<> inline \ void linalg_batch_gemm(const Tensor& A, const Tensor& B, \ @@ -82,6 +95,8 @@ void linalg_batch_gemm(const Tensor& A, const Tensor< LINALG_CPU_BATCH_GEMM(float) LINALG_CPU_BATCH_GEMM(double) +#endif // (MSHADOW_USE_CBLAS != 0) + #ifdef __CUDACC__ template @@ -198,7 +213,7 @@ void linalg_batch_gemm(const Tensor& A, const Tensor< LINALG_GPU_BATCH_GEMM(SgemmBatched, float) LINALG_GPU_BATCH_GEMM(DgemmBatched, double) -#endif +#endif // __CUDACC__ //////////////////////////////// TRSM //////////////////////////////////////////// @@ -218,6 +233,8 @@ inline void check_trsm(const Tensor& A, const Tensor inline \ void linalg_trsm(const Tensor& A, const Tensor& B, \ @@ -243,6 +260,8 @@ void linalg_batch_trsm(const Tensor& A, const Tensor< LINALG_CPU_BATCH_TRSM(float) LINALG_CPU_BATCH_TRSM(double) +#endif // (MSHADOW_USE_CBLAS != 0) + #ifdef __CUDACC__ // cublas col-major processing accounted for by 
switching sides and fill mode @@ -297,7 +316,7 @@ void linalg_batch_trsm(const Tensor& A, const Tensor< LINALG_GPU_BATCH_TRSM(StrsmBatched, float) LINALG_GPU_BATCH_TRSM(DtrsmBatched, double) -#endif +#endif // __CUDACC__ /*! * \brief Performs gemm, setting alpha and beta as appropriate for `req`. @@ -332,6 +351,31 @@ inline void linalg_gemm(const Tensor& A, } } +// A cpu specialization for linalg_gemm that uses mshadow::dot(), if no cblas. +#if (MSHADOW_USE_CBLAS == 0) +template +inline void linalg_gemm(const Tensor& A, + const Tensor& B, + const Tensor& C, + bool tA, bool tB, Stream *s, + mxnet::OpReqType req) { + using namespace mxnet; + switch (req) { + case kNullOp: + break; + case kWriteTo: + case kWriteInplace: + C = dot(tA ? A.T() : A, tB ? B.T() : B); + break; + case kAddTo: + C += dot(tA ? A.T() : A, tB ? B.T() : B); + break; + default: + LOG(FATAL) << "not reached"; + } +} +#endif + //////////////////////////////// TRMM //////////////////////////////////////////// // CPU/GPU-versions of BLAS3 function "trmm". 
Please refer to the BLAS3-documentation @@ -350,6 +394,8 @@ inline void check_trmm(const Tensor& A, const Tensor inline \ void linalg_trmm(const Tensor& A, const Tensor& B, \ @@ -375,6 +421,8 @@ void linalg_batch_trmm(const Tensor& A, const Tensor< LINALG_XPU_BATCH_TRMM(cpu, float) LINALG_XPU_BATCH_TRMM(cpu, double) +#endif // (MSHADOW_USE_CBLAS != 0) + #ifdef __CUDACC__ // cublas col-major processing accounted for by switching sides and fill mode @@ -401,7 +449,7 @@ LINALG_GPU_TRMM(Dtrmm, double) LINALG_XPU_BATCH_TRMM(gpu, float) LINALG_XPU_BATCH_TRMM(gpu, double) -#endif +#endif // __CUDACC__ //////////////////////////////// POTRF //////////////////////////////////////////// @@ -437,7 +485,7 @@ void linalg_batch_potrf(const Tensor& A, bool lower, LINALG_CPU_BATCH_POTRF(float) LINALG_CPU_BATCH_POTRF(double) -#if MXNET_USE_CUSOLVER == 1 +#if defined(__CUDACC__) && MXNET_USE_CUSOLVER == 1 #define LINALG_GPU_BUFFSIZE_POTRF(fname, DType) \ inline int linalg_potrf_buffsize(const Tensor& A, bool lower, Stream *s) { \ diff --git a/src/operator/spatial_transformer-inl.h b/src/operator/spatial_transformer-inl.h index 77967579340f..e29ad49c4aa6 100644 --- a/src/operator/spatial_transformer-inl.h +++ b/src/operator/spatial_transformer-inl.h @@ -35,6 +35,7 @@ #include #include #include "./operator_common.h" +#include "./linalg.h" namespace mxnet { @@ -100,7 +101,9 @@ class SpatialTransformerOp : public Operator { Copy(grid_dst, workspace, grid_dst.stream_); for (index_t batch = 0; batch < data.size(0); batch++) { if (param_.transform_type == st::kAffine) { - grid_src[batch] = dot(loc[batch], grid_dst); + // Legacy approach shown here for comparison: + // grid_src[batch] = dot(loc[batch], grid_dst); + linalg_gemm(loc[batch], grid_dst, grid_src[batch], false, false, s); } } if (param_.sampler_type == st::kBilinear) { @@ -133,7 +136,9 @@ class SpatialTransformerOp : public Operator { } for (index_t batch = 0; batch < data.size(0); batch++) { if (param_.transform_type == 
st::kAffine) { - gloc[batch] = dot(grid_src[batch], grid_dst.T()); + // Legacy approach shown here for comparison: + // gloc[batch] = dot(grid_src[batch], grid_dst.T()); + linalg_gemm(grid_src[batch], grid_dst, gloc[batch], false, true, s); } } } From 942e88865f0938db1c6284264d92a109f3f5830e Mon Sep 17 00:00:00 2001 From: Hu Shiwen Date: Sat, 26 Aug 2017 00:59:05 +0800 Subject: [PATCH 040/448] fix linalg_impl (#7611) * fix linalg_impl * fix * fix * fix --- Jenkinsfile | 11 ++++++++++- src/operator/linalg_impl.h | 38 +++++++++++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 2d4cc017c865..2dfc57c9a265 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -150,7 +150,7 @@ try { } } }, - 'Amalgamation': { + 'Amalgamation MIN': { node('mxnetlinux') { ws('workspace/amalgamation') { init_git() @@ -159,6 +159,15 @@ try { } } }, + 'Amalgamation': { + node('mxnetlinux') { + ws('workspace/amalgamation') { + init_git() + make('cpu', '-C amalgamation/ clean') + make('cpu', '-C amalgamation/ USE_BLAS=openblas') + } + } + }, 'GPU: MKLML': { node('mxnetlinux') { ws('workspace/build-mklml') { diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h index c1e813614c72..e95eff0cc407 100644 --- a/src/operator/linalg_impl.h +++ b/src/operator/linalg_impl.h @@ -324,9 +324,9 @@ LINALG_GPU_BATCH_TRSM(DtrsmBatched, double) * \param A the first operand of the gemm * \param B the second operand of the gemm * \param C the data to be assigned - * \tparam tA whether the `A` operand should be transposed first. - * \tparam tB whether the `B` operand should be transposed first. - * \tparam s the stream to perform the operation + * \param tA whether the `A` operand should be transposed first. + * \param tB whether the `B` operand should be transposed first. 
+ * \param s the stream to perform the operation * \param req the assignment request */ template @@ -353,8 +353,8 @@ inline void linalg_gemm(const Tensor& A, // A cpu specialization for linalg_gemm that uses mshadow::dot(), if no cblas. #if (MSHADOW_USE_CBLAS == 0) -template -inline void linalg_gemm(const Tensor& A, +template +inline void linalg_gemm(const Tensor& A, const Tensor& B, const Tensor& C, bool tA, bool tB, Stream *s, @@ -365,10 +365,34 @@ inline void linalg_gemm(const Tensor& A, break; case kWriteTo: case kWriteInplace: - C = dot(tA ? A.T() : A, tB ? B.T() : B); + if (tA) { + if (tB) { + const_cast&>(C) = dot(A.T(), B.T()); + } else { + const_cast&>(C) = dot(A.T(), B); + } + } else { + if (tB) { + const_cast&>(C) = dot(A, B.T()); + } else { + const_cast&>(C) = dot(A, B); + } + } break; case kAddTo: - C += dot(tA ? A.T() : A, tB ? B.T() : B); + if (tA) { + if (tB) { + const_cast&>(C) += dot(A.T(), B.T()); + } else { + const_cast&>(C) += dot(A.T(), B); + } + } else { + if (tB) { + const_cast&>(C) += dot(A, B.T()); + } else { + const_cast&>(C) += dot(A, B); + } + } break; default: LOG(FATAL) << "not reached"; From 3b3d824320acc41b8a2242683f9155d0fe67a8ca Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 25 Aug 2017 20:20:55 -0700 Subject: [PATCH 041/448] set build status to success only after job ends (#7628) Earlier code marks status as success initially. So any new PR shows jenkins status as success if we see the check mark on github. On opening the full build status, we see that builds haven't even started or are running. If something fails, variable changes to failure then. So even without this merge, a red mark on github indicates that build has failed correctly. That behavior is unchanged. 
--- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 2dfc57c9a265..ac34e71a53f1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -10,8 +10,6 @@ docker_run = 'tests/ci_build/ci_build.sh' max_time = 60 // assign any caught errors here err = null -// set build status to success by default -currentBuild.result = "SUCCESS" // initialize source codes def init_git() { @@ -438,6 +436,8 @@ try { } } } + // set build status to success at the end + currentBuild.result = "SUCCESS" } catch (caughtError) { node("mxnetlinux") { sh "echo caught error" From c42453dfb4cd4baf6a0fe12ccc34cda62ea40df9 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 25 Aug 2017 20:21:51 -0700 Subject: [PATCH 042/448] Fix build status of a test (#7629) installs bc required by sh2ju.sh and changes the regex match to capital alphabet as it clashes with a warning thrown by opencv driver --- tests/nightly/test_image_classification.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/nightly/test_image_classification.sh b/tests/nightly/test_image_classification.sh index 93e403a2affc..7ab443dc044c 100755 --- a/tests/nightly/test_image_classification.sh +++ b/tests/nightly/test_image_classification.sh @@ -21,6 +21,8 @@ # setup export LD_LIBRARY_PATH=`pwd`/`dirname $0`/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH export PYTHONPATH=`pwd`/`dirname $0`/python +# bc is required by sh2ju.sh +apt-get install bc cd `pwd`/`dirname $0` . 
sh2ju.sh @@ -59,8 +61,8 @@ juLog -name=Build -error=Error build # check if the final evaluation accuracy exceed the threshold check_val() { expected=$1 - pass="Final validation >= $expected, Pass" - fail="Final validation < $expected, Fail" + pass="Final validation >= $expected, PASS" + fail="Final validation < $expected, FAIL" python ../../tools/parse_log.py log --format none | tail -n1 | \ awk "{ if (\$3~/^[.0-9]+$/ && \$3 > $expected) print \"$pass\"; else print \"$fail\"}" rm -f log @@ -88,6 +90,6 @@ test_lenet() { check_val $desired_accuracy done } -juLog -name=Python.Lenet.Mnist -error=Fail test_lenet +juLog -name=Python.Lenet.Mnist -error=FAIL test_lenet exit $errors From 50342a432327e4de8cf65d5935c4c80073b35a6a Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Fri, 25 Aug 2017 21:27:34 -0700 Subject: [PATCH 043/448] entire codebase build with mshadow_use_clas=0 (#7625) --- src/operator/linalg_impl.h | 199 +++++++++++++++++++++++-------------- 1 file changed, 127 insertions(+), 72 deletions(-) diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h index e95eff0cc407..27378db201bd 100644 --- a/src/operator/linalg_impl.h +++ b/src/operator/linalg_impl.h @@ -56,7 +56,7 @@ inline void check_gemm(const Tensor& A, const Tensor inline \ @@ -68,19 +68,6 @@ void linalg_gemm(const Tensor& A, const Tensor for DType=mshadow::half::half_t. 
-template<> inline -void linalg_gemm(const Tensor& A, - const Tensor& B, - const Tensor& C, - mshadow::half::half_t alpha, - mshadow::half::half_t beta, - bool tA, bool tB, Stream *s) { - LOG(FATAL) << "FP16 gemm on cpu not implemented!"; -} #define LINALG_CPU_BATCH_GEMM(DType) \ template<> inline \ @@ -92,10 +79,43 @@ void linalg_batch_gemm(const Tensor& A, const Tensor< linalg_gemm(A[i], B[i], C[i], alpha, beta, tA, tB); \ } \ } + +#else + +#define LINALG_CPU_GEMM(fname, DType) \ +template<> inline \ +void linalg_gemm(const Tensor& A, const Tensor& B, \ + const Tensor& C, DType alpha, DType beta, \ + bool tA, bool tB, Stream *s) { \ + LOG(FATAL) << "linalg_gemm (without req arg) not implemented by mxnet for cpu, needs cblas!"; \ +} + +#define LINALG_CPU_BATCH_GEMM(DType) \ +template<> inline \ +void linalg_batch_gemm(const Tensor& A, const Tensor& B, \ + const Tensor& C, DType alpha, DType beta, \ + bool tA, bool tB, Stream *s) { \ + LOG(FATAL) << "linalg_batch_gemm not implemented by mxnet for cpu, needs cblas!"; \ +} + +#endif // MSHADOW_USE_CBLAS == 1 + +LINALG_CPU_GEMM(sgemm, float) +LINALG_CPU_GEMM(dgemm, double) + LINALG_CPU_BATCH_GEMM(float) LINALG_CPU_BATCH_GEMM(double) -#endif // (MSHADOW_USE_CBLAS != 0) +// Specialization of linalg_gemm for DType=mshadow::half::half_t. 
+template<> inline +void linalg_gemm(const Tensor& A, + const Tensor& B, + const Tensor& C, + mshadow::half::half_t alpha, + mshadow::half::half_t beta, + bool tA, bool tB, Stream *s) { + LOG(FATAL) << "FP16 gemm on cpu not implemented!"; +} #ifdef __CUDACC__ @@ -233,7 +253,7 @@ inline void check_trsm(const Tensor& A, const Tensor inline \ @@ -245,8 +265,6 @@ void linalg_trsm(const Tensor& A, const Tensor inline \ @@ -257,11 +275,31 @@ void linalg_batch_trsm(const Tensor& A, const Tensor< linalg_trsm(A[i], B[i], alpha, rightside, lower, transpose); \ } \ } + +#else + +#define LINALG_CPU_TRSM(fname, DType) \ +template<> inline \ +void linalg_trsm(const Tensor& A, const Tensor& B, \ + DType alpha, bool rightside, bool lower, bool transpose, Stream *s) { \ + LOG(FATAL) << "linalg_trsm not implemented, needs cblas!"; \ +} + +#define LINALG_CPU_BATCH_TRSM(DType) \ +template<> inline \ +void linalg_batch_trsm(const Tensor& A, const Tensor& B, \ + DType alpha, bool rightside, bool lower, bool transpose, Stream *s) { \ + LOG(FATAL) << "linalg_batch_trsm not implemented, needs cblas!"; \ +} + +#endif // MSHADOW_USE_CBLAS == 1 + +LINALG_CPU_TRSM(strsm, float) +LINALG_CPU_TRSM(dtrsm, double) + LINALG_CPU_BATCH_TRSM(float) LINALG_CPU_BATCH_TRSM(double) -#endif // (MSHADOW_USE_CBLAS != 0) - #ifdef __CUDACC__ // cublas col-major processing accounted for by switching sides and fill mode @@ -351,54 +389,60 @@ inline void linalg_gemm(const Tensor& A, } } -// A cpu specialization for linalg_gemm that uses mshadow::dot(), if no cblas. 
-#if (MSHADOW_USE_CBLAS == 0) -template -inline void linalg_gemm(const Tensor& A, - const Tensor& B, - const Tensor& C, - bool tA, bool tB, Stream *s, - mxnet::OpReqType req) { - using namespace mxnet; - switch (req) { - case kNullOp: - break; - case kWriteTo: - case kWriteInplace: - if (tA) { - if (tB) { - const_cast&>(C) = dot(A.T(), B.T()); - } else { - const_cast&>(C) = dot(A.T(), B); - } - } else { - if (tB) { - const_cast&>(C) = dot(A, B.T()); - } else { - const_cast&>(C) = dot(A, B); - } - } - break; - case kAddTo: - if (tA) { - if (tB) { - const_cast&>(C) += dot(A.T(), B.T()); - } else { - const_cast&>(C) += dot(A.T(), B); - } - } else { - if (tB) { - const_cast&>(C) += dot(A, B.T()); - } else { - const_cast&>(C) += dot(A, B); - } - } - break; - default: - LOG(FATAL) << "not reached"; - } +#if MSHADOW_USE_CBLAS == 0 + +// A template for a cpu linalg_gemm implementation using mshadow::dot() +#define LINALG_CPU_GEMM_NO_CBLAS(DType) \ +template<> inline \ +void linalg_gemm(const Tensor& A, \ + const Tensor& B, \ + const Tensor& C, \ + bool tA, bool tB, Stream *s, \ + mxnet::OpReqType req) { \ + using namespace mxnet; \ + switch (req) { \ + case kNullOp: \ + break; \ + case kWriteTo: \ + case kWriteInplace: \ + if (tA) { \ + if (tB) { \ + const_cast&>(C) = dot(A.T(), B.T()); \ + } else { \ + const_cast&>(C) = dot(A.T(), B); \ + } \ + } else { \ + if (tB) { \ + const_cast&>(C) = dot(A, B.T()); \ + } else { \ + const_cast&>(C) = dot(A, B); \ + } \ + } \ + break; \ + case kAddTo: \ + if (tA) { \ + if (tB) { \ + const_cast&>(C) += dot(A.T(), B.T()); \ + } else { \ + const_cast&>(C) += dot(A.T(), B); \ + } \ + } else { \ + if (tB) { \ + const_cast&>(C) += dot(A, B.T()); \ + } else { \ + const_cast&>(C) += dot(A, B); \ + } \ + } \ + break; \ + default: \ + LOG(FATAL) << "not reached"; \ + } \ } -#endif + +LINALG_CPU_GEMM_NO_CBLAS(float) +LINALG_CPU_GEMM_NO_CBLAS(double) + +#endif // (MSHADOW_USE_CBLAS == 0) //////////////////////////////// TRMM 
//////////////////////////////////////////// @@ -418,7 +462,7 @@ inline void check_trmm(const Tensor& A, const Tensor inline \ @@ -430,8 +474,17 @@ void linalg_trmm(const Tensor& A, const Tensor inline \ +void linalg_trmm(const Tensor& A, const Tensor& B, \ + DType alpha, bool rightside, bool lower, bool transpose, Stream *s) { \ + LOG(FATAL) << "linalg_trmm not implemented, needs cblas!"; \ +} + +#endif // MSHADOW_USE_CBLAS == 1 #define LINALG_XPU_BATCH_TRMM(xpu, DType) \ template<> inline \ @@ -442,11 +495,13 @@ void linalg_batch_trmm(const Tensor& A, const Tensor< linalg_trmm(A[i], B[i], alpha, rightside, lower, transpose, s); \ } \ } + +LINALG_CPU_TRMM(strmm, float) +LINALG_CPU_TRMM(dtrmm, double) + LINALG_XPU_BATCH_TRMM(cpu, float) LINALG_XPU_BATCH_TRMM(cpu, double) -#endif // (MSHADOW_USE_CBLAS != 0) - #ifdef __CUDACC__ // cublas col-major processing accounted for by switching sides and fill mode From b7fcd090e940b923c62d24fee571b0fd4c5418cd Mon Sep 17 00:00:00 2001 From: Kai Li <1196594711@qq.com> Date: Sat, 26 Aug 2017 15:22:09 +0800 Subject: [PATCH 044/448] Update README.md (#7630) --- example/image-classification/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/image-classification/README.md b/example/image-classification/README.md index 3f514e2a391f..1c72a1d78d9f 100644 --- a/example/image-classification/README.md +++ b/example/image-classification/README.md @@ -128,7 +128,7 @@ to calculate the accuracy. 
| `imagenet1k-resnet-152` | 0.7653 | 0.9312 | | `imagenet1k-resnext-50` | 0.7689 | 0.9332 | | `imagenet1k-resnext-101` | 0.7828 | 0.9408 | -| `imagenet1k-rexnext-101-64x4d` | 0.7911 | 0.9430 | +| `imagenet1k-resnext-101-64x4d` | 0.7911 | 0.9430 | Note: - our Resnet does not need to specify the RGB mean due the data batch From 2e6ef8c88006a8c80dd5f7be3cc4c6ed56cbcaae Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Sat, 26 Aug 2017 11:37:10 -0700 Subject: [PATCH 045/448] unit test for csv iter, doc update for libsvmiter (#7623) * add unit test for csv iter * fix lint * add libsvm to mxnet.io doc * update libsvm doc --- docs/api/python/io.md | 1 + python/mxnet/io.py | 3 +- src/io/inst_vector.h | 2 +- src/io/iter_libsvm.cc | 90 +++++++++++++++++--------------- tests/python/unittest/test_io.py | 21 ++++++++ 5 files changed, 73 insertions(+), 44 deletions(-) diff --git a/docs/api/python/io.md b/docs/api/python/io.md index 15f8aa3ce354..ce8245b73fe8 100644 --- a/docs/api/python/io.md +++ b/docs/api/python/io.md @@ -56,6 +56,7 @@ A detailed tutorial is available at io.NDArrayIter io.CSVIter + io.LibSVMIter io.ImageRecordIter io.ImageRecordUInt8Iter io.MNISTIter diff --git a/python/mxnet/io.py b/python/mxnet/io.py index 314a2b28dfa8..b1696815274a 100644 --- a/python/mxnet/io.py +++ b/python/mxnet/io.py @@ -189,6 +189,7 @@ class DataIter(object): -------- NDArrayIter : Data-iterator for MXNet NDArray or numpy-ndarray objects. CSVIter : Data-iterator for csv data. + LibSVMIter : Data-iterator for libsvm data. ImageIter : Data-iterator for images. """ def __init__(self, batch_size=0): @@ -721,7 +722,7 @@ class MXDataIter(DataIter): """A python wrapper a C++ data iterator. This iterator is the Python wrapper to all native C++ data iterators, such - as `CSVIter, `ImageRecordIter`, `MNISTIter`, etc. When initializing + as `CSVIter`, `ImageRecordIter`, `MNISTIter`, etc. When initializing `CSVIter` for example, you will get an `MXDataIter` instance to use in your Python code. 
Calls to `next`, `reset`, etc will be delegated to the underlying C++ data iterators. diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h index 6dc7bdfd730a..afa19e277653 100644 --- a/src/io/inst_vector.h +++ b/src/io/inst_vector.h @@ -169,7 +169,7 @@ struct TBlobBatch { } /*! \brief destructor */ ~TBlobBatch() { - delete inst_index; + delete[] inst_index; } }; // struct TBlobBatch diff --git a/src/io/iter_libsvm.cc b/src/io/iter_libsvm.cc index 803d19e74481..8e53e6f28712 100644 --- a/src/io/iter_libsvm.cc +++ b/src/io/iter_libsvm.cc @@ -198,19 +198,21 @@ class LibSVMIter: public SparseIIterator { DMLC_REGISTER_PARAMETER(LibSVMIterParam); MXNET_REGISTER_IO_ITER(LibSVMIter) -.describe(R"code(Returns the LibSVM file iterator. This iterator is experimental and -should be used with care. +.describe(R"code(Returns the libsvm file iterator which returns sparse data with `csr` +storage type. This iterator is experimental and should be used with care. -The input data is similar to libsvm file format, except that the indices are expected to be -zero-based instead of one-based. Details of the libsvm format are available at -`https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/` +The input data is stored in a format similar to libsvm file format, except that the indices +are expected to be zero-based instead of one-based. Details of the libsvm format are available +at `https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/` In this function, the `data_shape` parameter is used to set the shape of each line of the data. The dimension of both `data_shape` and `label_shape` are expected to be 1. When `label_libsvm` is set to ``NULL``, both data and label are read from the same file specified -by `data_libsvm`. Otherwise, data is read from `data_libsvm` and label from `label_libsvm`, -in this case, if `data_libsvm` contains label, it will ignored. +by `data_libsvm`. In this case, the data is stored in `csr` storage type, while the label is a 1D +dense array. 
Otherwise, data is read from `data_libsvm` and label from `label_libsvm`, +in this case, both data and label are stored in csr storage type. If `data_libsvm` contains label, +it will ignored. The `LibSVMIter` only support `round_batch` parameter set to ``True`` for now. So, if `batch_size` is 3 and there are 4 total rows in libsvm file, 2 more examples @@ -221,58 +223,62 @@ If ``data_libsvm = 'data/'`` is set, then all the files in this directory will b Examples:: - // Contents of libsvm file ``data.t``. + # Contents of libsvm file ``data.t``. 1.0 0:0.5 2:1.2 -2.0 -3.0 0:0.6 1:2.4 2:1.2 4 2:-1.2 - // Creates a `LibSVMIter` with `batch_size`=3. - LibSVMIter = mx.io.LibSVMIter(data_libsvm = 'data.t', data_shape = (3,), - batch_size = 3) - - // The first batch (data and label) - [[ 0.5 0. 1.2 ] - [ 0. 0. 0. ] - [ 0.6 2.4 1.2 ]] - + # Creates a `LibSVMIter` with `batch_size`=3. + >>> data_iter = mx.io.LibSVMIter(data_libsvm = 'data.t', data_shape = (3,), batch_size = 3) + # The data of the first batch is stored in csr storage type + >>> batch = data_iter.next() + >>> csr = batch.data[0] + + >>> csr.asnumpy() + [[ 0.5 0. 1.2 ] + [ 0. 0. 0. ] + [ 0.6 2.4 1.2]] + # The label of first batch + >>> label = batch.label[0] + >>> label [ 1. -2. -3.] + - // The second batch (data and label) + >>> second_batch = data_iter.next() + # The data of the second batch + >>> second_batch.data[0].asnumpy() [[ 0. 0. -1.2 ] [ 0.5 0. 1.2 ] [ 0. 0. 0. ]] - + # The label of the second batch + >>> second_batch.label[0].asnumpy() [ 4. 1. -2.] - // Contents of libsvm file ``label.t`` + # Contents of libsvm file ``label.t`` 1.0 -2.0 0:0.125 -3.0 2:1.2 4 1:1.0 2:-1.2 - // Creates a `LibSVMIter` with specified label file - LibSVMIter = mx.io.LibSVMIter(data_libsvm = 'data.t', data_shape = (3,), - label_libsvm = 'label.t', label_shape = (3,), batch_size = 3) - - // Two batches of data read from the above iterator are as follows(data and label): - // The first batch - [[ 0.5 0. 1.2 ] - [ 0. 0. 0. 
] - [ 0.6 2.4 1.2 ]] - - [[ 0. 0. 0. ] - [ 0.125 0. 0. ] - [ 0. 0. 1.2 ]] - - // The second batch - [[ 0. 0. -1.2 ] - [ 0.5 0. 1.2 ] - [ 0. 0. 0. ]] - - [[ 0. 1. -1.2 ] - [ 0. 0. 0. ] - [ 0.125 0. 0. ]] + # Creates a `LibSVMIter` with specified label file + >>> data_iter = mx.io.LibSVMIter(data_libsvm = 'data.t', data_shape = (3,), + label_libsvm = 'label.t', label_shape = (3,), batch_size = 3) + + # Both data and label are in csr storage type + >>> batch = data_iter.next() + >>> csr_data = batch.data[0] + + >>> csr_data.asnumpy() + [[ 0.5 0. 1.2 ] + [ 0. 0. 0. ] + [ 0.6 2.4 1.2 ]] + >>> csr_label = batch.label[0] + + >>> csr_label.asnumpy() + [[ 0. 0. 0. ] + [ 0.125 0. 0. ] + [ 0. 0. 1.2 ]] )code" ADD_FILELINE) .add_arguments(LibSVMIterParam::__FIELDS__()) diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py index a543463f3663..fb8aa2aa20be 100644 --- a/tests/python/unittest/test_io.py +++ b/tests/python/unittest/test_io.py @@ -257,6 +257,26 @@ def check_libSVMIter_news_data(): check_libSVMIter_synthetic() check_libSVMIter_news_data() +def test_CSVIter(): + def check_CSVIter_synthetic(): + cwd = os.getcwd() + data_path = os.path.join(cwd, 'data.t') + label_path = os.path.join(cwd, 'label.t') + with open(data_path, 'w') as fout: + for i in range(1000): + fout.write(','.join(['1' for _ in range(8*8)]) + '\n') + with open(label_path, 'w') as fout: + for i in range(1000): + fout.write('0\n') + + data_train = mx.io.CSVIter(data_csv=data_path, data_shape=(8,8), + label_csv=label_path, batch_size=100) + expected = mx.nd.ones((100, 8, 8)) + for batch in iter(data_train): + assert_almost_equal(data_train.getdata().asnumpy(), expected.asnumpy()) + + check_CSVIter_synthetic() + if __name__ == "__main__": test_NDArrayIter() if h5py: @@ -265,3 +285,4 @@ def check_libSVMIter_news_data(): test_Cifar10Rec() test_LibSVMIter() test_NDArrayIter_csr() + test_CSVIter() From 1e48e1238609b94cae2af3d0d72b33882b9c5a24 Mon Sep 17 00:00:00 2001 From: dtmoodie 
Date: Sat, 26 Aug 2017 14:51:34 -0400 Subject: [PATCH 046/448] gpu access of ndarray (#7496) * gpu access of ndarray * gpu access from C++ api * gpu access fix * Update c_api.cc * Update c_api.cc --- cpp-package/include/mxnet-cpp/ndarray.hpp | 1 - src/c_api/c_api.cc | 8 +------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/cpp-package/include/mxnet-cpp/ndarray.hpp b/cpp-package/include/mxnet-cpp/ndarray.hpp index 5ed04a547b85..6bf26432359b 100644 --- a/cpp-package/include/mxnet-cpp/ndarray.hpp +++ b/cpp-package/include/mxnet-cpp/ndarray.hpp @@ -359,7 +359,6 @@ inline int NDArray::GetDType() const { inline const mx_float *NDArray::GetData() const { void *ret; - CHECK_NE(GetContext().GetDeviceType(), DeviceType::kGPU); MXNDArrayGetData(blob_ptr_->handle_, &ret); if (GetDType() != 0) { return NULL; diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 0fe3fe3e302e..088e208c9cdc 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -437,13 +437,7 @@ int MXNDArrayGetData(NDArrayHandle handle, API_BEGIN(); NDArray *arr = static_cast(handle); if (!arr->is_none()) { - CHECK(arr->ctx().dev_mask() == cpu::kDevMask) - << "MXNDArrayGetData can only be called for NDArray on CPU"; - const TBlob &b = arr->data(); - CHECK(b.CheckContiguous()); - MSHADOW_REAL_TYPE_SWITCH(arr->dtype(), DType, { - *out_pdata = b.FlatTo2D().dptr_; - }); + *out_pdata = arr->data().dptr_; } else { *out_pdata = nullptr; } From cb36058dfc19aed092356db26ea4de4676c86f5e Mon Sep 17 00:00:00 2001 From: Eric Junyuan Xie Date: Sat, 26 Aug 2017 17:51:30 -0700 Subject: [PATCH 047/448] refactor cudnn algo reg to no use string (#7561) * refactor cudnn algo reg to no use string * refactor ctx list * fix * refactor save_inputs --- dmlc-core | 2 +- mshadow | 2 +- nnvm | 2 +- python/mxnet/gluon/parameter.py | 59 ++++++------ python/mxnet/metric.py | 8 +- src/c_api/c_api_common.h | 2 + src/c_api/c_api_function.cc | 4 +- src/c_api/c_api_ndarray.cc | 56 +++++++---- src/io/inst_vector.h | 10 +- 
src/io/iter_mnist.cc | 2 +- src/ndarray/autograd.cc | 111 ++++++++++++---------- src/ndarray/autograd.h | 30 +++--- src/ndarray/ndarray.cc | 6 +- src/operator/contrib/fft-inl.h | 6 +- src/operator/contrib/fft.cc | 10 +- src/operator/contrib/fft.cu | 11 ++- src/operator/contrib/ifft-inl.h | 7 +- src/operator/contrib/ifft.cc | 10 +- src/operator/contrib/ifft.cu | 11 ++- src/operator/contrib/multi_proposal-inl.h | 105 ++------------------ src/operator/contrib/multi_proposal.cu | 6 +- src/operator/contrib/proposal-inl.h | 105 ++------------------ src/operator/contrib/proposal.cc | 6 +- src/operator/contrib/proposal.cu | 6 +- src/operator/convolution-inl.h | 41 ++++++++ src/operator/cudnn_algoreg-inl.h | 90 ++++++++++++------ src/operator/cudnn_algoreg.cc | 13 ++- src/operator/cudnn_convolution-inl.h | 16 ++-- src/operator/cudnn_deconvolution-inl.h | 17 ++-- src/operator/deconvolution-inl.h | 45 +++++++++ 30 files changed, 386 insertions(+), 413 deletions(-) diff --git a/dmlc-core b/dmlc-core index 71bfbd3a9460..e880afeb932d 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 71bfbd3a946075cea66ca9e19bad86dd33c19b46 +Subproject commit e880afeb932d746e55eb92e8c6eb3ff1b3697c48 diff --git a/mshadow b/mshadow index 6d75df228978..380f825b84e2 160000 --- a/mshadow +++ b/mshadow @@ -1 +1 @@ -Subproject commit 6d75df228978ca5f182dd707578ef704099ab5ee +Subproject commit 380f825b84e28216516377e71199a8e14f12352f diff --git a/nnvm b/nnvm index bcfbf903429d..e842c098decf 160000 --- a/nnvm +++ b/nnvm @@ -1 +1 @@ -Subproject commit bcfbf903429d086f16b19b4d202788de06e45536 +Subproject commit e842c098decf9f5eb6bd84e307c58e50078596b7 diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index bef55d67e140..4bc2611a70a7 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -129,14 +129,22 @@ def grad_req(self, req): elif self._data is not None: self._init_grad() - def _check_initialized(self, ctx=None): - if 
self._data is not None: - if ctx is not None and ctx not in self._data: - raise RuntimeError( - "Parameter %s was not initialized on context %s. " - "It was only initialized on %s."%( - self.name, str(ctx), str(self.list_ctx()))) - return + def _check_and_get(self, arr_dict, ctx): + if arr_dict is not None: + if ctx is list: + return list(arr_dict.values()) + if ctx is None: + if len(self._ctx_list) == 1: + ctx = self._ctx_list[0] + else: + ctx = context.current_context() + ret = arr_dict.get(ctx, None) + if ret is not None: + return ret + raise RuntimeError( + "Parameter %s was not initialized on context %s. " + "It was only initialized on %s."%( + self.name, str(ctx), str(self._ctx_list))) if self._deferred_init: raise DeferredInitializationError raise RuntimeError( @@ -199,6 +207,7 @@ def _finish_deferred_init(self): def _init_impl(self, data, ctx): """Sets data and grad.""" self._data = OrderedDict() + self._ctx_list = list(ctx) for i in ctx: self._data[i] = data.copyto(i) self._init_grad() @@ -327,20 +336,12 @@ def data(self, ctx=None): ------- NDArray on ctx """ - if ctx is None: - list_ctx = self.list_ctx() - if len(list_ctx) == 1: - ctx = list_ctx[0] - else: - ctx = context.current_context() - self._check_initialized(ctx) - return self._data[ctx] + return self._check_and_get(self._data, ctx) def list_data(self): """Returns copies of this parameter on all contexts, in the same order as creation.""" - self._check_initialized() - return list(self._data.values()) + return self._check_and_get(self._data, list) def grad(self, ctx=None): """Returns a gradient buffer for this parameter on one context. @@ -350,26 +351,20 @@ def grad(self, ctx=None): ctx : Context Desired context. 
""" - if ctx is None: - list_ctx = self.list_ctx() - if len(list_ctx) == 1: - ctx = list_ctx[0] - else: - ctx = context.current_context() - self._check_initialized(ctx) - if self._grad is None: + if self._data is not None and self._grad is None: raise RuntimeError( "Cannot get gradient array for Parameter %s " \ "because grad_req='null'"%(self.name)) - return self._grad[ctx] + return self._check_and_get(self._grad, ctx) def list_grad(self): """Returns gradient buffers on all contexts, in the same order as `values`.""" - self._check_initialized() - assert self._grad is not None, \ - "Parameter %s does not have gradients because grad_req='null'"%self.name - return list(self._grad.values()) + if self._data is not None and self._grad is None: + raise RuntimeError( + "Cannot get gradient array for Parameter %s " \ + "because grad_req='null'"%(self.name)) + return self._check_and_get(self._grad, list) def list_ctx(self): """Returns a list of contexts this parameter is initialized on.""" @@ -377,7 +372,7 @@ def list_ctx(self): if self._deferred_init: return self._deferred_init[1] raise RuntimeError("Parameter %s has not been initialized"%self.name) - return list(self._data.keys()) + return self._ctx_list def zero_grad(self): """Sets gradient buffer on all contexts to 0. 
No action is taken if diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index 00cc2da61f3c..a33b00ae8ab3 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -390,13 +390,13 @@ def update(self, labels, preds): for label, pred_label in zip(labels, preds): if pred_label.shape != label.shape: pred_label = ndarray.argmax(pred_label, axis=self.axis) - pred_label = pred_label.asnumpy().astype('int32') - label = label.asnumpy().astype('int32') + label = label.astype('int32') + pred_label = pred_label.astype('int32').as_in_context(label.context) check_label_shapes(label, pred_label) - self.sum_metric += (pred_label.flat == label.flat).sum() - self.num_inst += len(pred_label.flat) + self.sum_metric += ndarray.sum(label == pred_label).asscalar() + self.num_inst += label.size @register diff --git a/src/c_api/c_api_common.h b/src/c_api/c_api_common.h index fee3f03f6db0..1ef385609239 100644 --- a/src/c_api/c_api_common.h +++ b/src/c_api/c_api_common.h @@ -84,6 +84,8 @@ struct MXAPIThreadLocalEntry { std::vector arg_shape_data, out_shape_data, aux_shape_data; /*! \brief uint32_t buffer for returning shape pointer */ std::vector arg_shape_buffer, out_shape_buffer, aux_shape_buffer; + /*! 
\brief bool buffer */ + std::vector save_inputs, save_outputs; // helper function to setup return value of shape array inline static void SetupShapeArrayReturnWithBuffer( const std::vector &shapes, diff --git a/src/c_api/c_api_function.cc b/src/c_api/c_api_function.cc index 3d8b5328c1a0..259c1331c7af 100644 --- a/src/c_api/c_api_function.cc +++ b/src/c_api/c_api_function.cc @@ -188,8 +188,8 @@ int MXCustomFunctionRecord(int num_inputs, NDArrayHandle *inputs, attrs.parsed = params; // TODO(piiswrong): remove state by using FComputeEx auto state = OpStatePtr::Create(params); - AutogradRuntime::Get()->RecordImperativeOperator( - state, attrs.op, attrs, &ndinputs, &ndoutputs); + AutogradRuntime::Get()->RecordOp( + std::move(attrs), &ndinputs, &ndoutputs, state); for (size_t i = 0; i < ndoutputs.size(); ++i) { *reinterpret_cast(outputs[i]) = ndoutputs[i]; diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index d392baf45d3e..64fa74d8b8c3 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -484,9 +484,11 @@ void PushOperator(const OpStatePtr& state, } void ImperativeInvokeImpl(const Context& default_ctx, - const nnvm::NodeAttrs& attrs, + nnvm::NodeAttrs&& attrs, std::vector* p_ndinputs, - std::vector* p_ndoutputs) { + std::vector* p_ndoutputs, + std::vector* p_save_inputs = nullptr, + std::vector* p_save_outputs = nullptr) { static auto& ndfunc = nnvm::Op::GetAttr("FNDArrayFunction"); static auto& createop = nnvm::Op::GetAttr("FCreateOpState"); MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); @@ -514,29 +516,32 @@ void ImperativeInvokeImpl(const Context& default_ctx, FCompute fn = common::GetFCompute(op, "FCompute", ctx); FComputeEx fn_ex = common::GetFCompute(op, "FComputeEx", ctx); if (fn_ex && stype != kDefaultStorage) { - if (AutogradRuntime::Get()->IsRecording()) { - AutogradRuntime::Get()->RecordImperativeFCompute(op, - attrs, &ndinputs, &ndoutputs); - } PushFComputeEx(fn_ex, op, attrs, ctx, read_vars, write_vars, 
requested, ndinputs, ndoutputs); - } else if (fn) { if (AutogradRuntime::Get()->IsRecording()) { - AutogradRuntime::Get()->RecordImperativeFCompute(op, - attrs, &ndinputs, &ndoutputs); + AutogradRuntime::Get()->RecordOp( + std::move(attrs), &ndinputs, &ndoutputs, OpStatePtr(), + p_save_inputs, p_save_outputs); } + } else if (fn) { PushFCompute(fn, op, attrs, ctx, read_vars, write_vars, requested, ndinputs, ndoutputs, mutate_idx); + if (AutogradRuntime::Get()->IsRecording()) { + AutogradRuntime::Get()->RecordOp( + std::move(attrs), &ndinputs, &ndoutputs, OpStatePtr(), + p_save_inputs, p_save_outputs); + } } else if (createop.count(op)) { auto state = createop[op](attrs, ctx, ret->arg_shapes, ret->arg_types); - if (AutogradRuntime::Get()->IsRecording()) { - AutogradRuntime::Get()->RecordImperativeOperator(state, op, - attrs, &ndinputs, &ndoutputs); - } write_vars.push_back(state.get_var()); PushOperator(state, op, attrs, ctx, read_vars, write_vars, requested, ndinputs, ndoutputs, mutate_idx); + if (AutogradRuntime::Get()->IsRecording()) { + AutogradRuntime::Get()->RecordOp( + std::move(attrs), &ndinputs, &ndoutputs, state, + p_save_inputs, p_save_outputs); + } } else { LOG(FATAL) << "Operator " << op->name << " is not implemented for " @@ -569,7 +574,7 @@ int MXImperativeInvoke(AtomicSymbolCreator creator, SetNDInputsOutputs(op, &ndinputs, &ndoutputs, num_inputs, inputs, num_outputs, infered_num_outputs, num_visible_outputs, outarray); - ImperativeInvokeImpl(Context::CPU(), attrs, &ndinputs, &ndoutputs); + ImperativeInvokeImpl(Context::CPU(), std::move(attrs), &ndinputs, &ndoutputs); if (outarray == nullptr) { ret->ret_handles.clear(); @@ -618,6 +623,20 @@ int MXCreateCachedOp(SymbolHandle handle, auto vars = sym->ListInputs(nnvm::Symbol::kAll); CHECK_GE(vars.size(), 1) << "CachedOp must have at least 1 input."; g->attrs["vars"] = std::make_shared(std::move(vars)); + + const nnvm::IndexedGraph& idx = g->indexed_graph(); + std::vector > save_inputs(idx.num_nodes()); + 
std::vector > save_outputs(idx.num_nodes()); + for (size_t i = 0; i < idx.num_nodes(); ++i) { + nnvm::NodePtr node = nnvm::Node::Create(); + node->attrs = idx[i].source->attrs; + AutogradRuntime::Get()->GetBackwardDependency( + node, idx[i].source->num_inputs(), idx[i].source->num_outputs(), + &save_inputs[i], &save_outputs[i]); + } + g->attrs["save_inputs"] = std::make_shared(std::move(save_inputs)); + g->attrs["save_outputs"] = std::make_shared(std::move(save_outputs)); + *out = g; API_END(); } @@ -640,7 +659,11 @@ int MXInvokeCachedOp(CachedOpHandle handle, API_BEGIN(); const std::vector& vars = - g->GetAttr >("vars"); + g->GetAttr >("vars"); + std::vector > save_inputs = + g->GetAttr > >("save_inputs"); + std::vector > save_outputs = + g->GetAttr > >("save_outputs"); const nnvm::IndexedGraph& idx = g->indexed_graph(); CHECK_EQ(static_cast(num_inputs), vars.size()) << "Actually number of inputs differs from expected number of inputs"; @@ -661,7 +684,8 @@ int MXInvokeCachedOp(CachedOpHandle handle, in.emplace_back(buff[idx.entry_id(j)]); } std::vector out(node.source->num_outputs()); - ImperativeInvokeImpl(default_ctx, node.source->attrs, &in, &out); + ImperativeInvokeImpl(default_ctx, nnvm::NodeAttrs(node.source->attrs), &in, &out, + &save_inputs[i], &save_outputs[i]); for (size_t j = 0; j < node.source->num_outputs(); ++j) { buff[idx.entry_id(i, j)] = std::move(out[j]); diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h index afa19e277653..7c4e894be924 100644 --- a/src/io/inst_vector.h +++ b/src/io/inst_vector.h @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -173,16 +172,16 @@ struct TBlobBatch { } }; // struct TBlobBatch -class TBlobContainer : public mshadow::TBlob { +class TBlobContainer : public TBlob { public: TBlobContainer(void) - : mshadow::TBlob(), tensor_container_(nullptr) {} + : TBlob(), tensor_container_(nullptr) {} ~TBlobContainer() { if (tensor_container_) { release(); } } - void resize(const mshadow::TShape 
&shape, int type_flag) { + void resize(const TShape &shape, int type_flag) { if (tensor_container_) { CHECK_EQ(this->type_flag_, type_flag); this->shape_ = shape; @@ -192,13 +191,12 @@ class TBlobContainer : public mshadow::TBlob { this->shape_ = shape; create(); } - this->stride_ = shape_[shape_.ndim() - 1]; } private: void create() { CHECK(tensor_container_ == nullptr); - CHECK_EQ(this->dev_mask_, mshadow::cpu::kDevMask); + CHECK_EQ(this->dev_mask(), mshadow::cpu::kDevMask); MSHADOW_TYPE_SWITCH(this->type_flag_, DType, { auto tensor_container = new mshadow::TensorContainer(false); tensor_container->Resize(mshadow::Shape1(shape_.Size())); diff --git a/src/io/iter_mnist.cc b/src/io/iter_mnist.cc index 055af52aaebd..9dbedbbba448 100644 --- a/src/io/iter_mnist.cc +++ b/src/io/iter_mnist.cc @@ -103,7 +103,7 @@ class MNISTIter: public IIterator { out_.batch_size = param_.batch_size; if (param_.shuffle) this->Shuffle(); if (param_.silent == 0) { - mshadow::TShape s; + TShape s; s = batch_data_.shape_; if (param_.flat) { LOG(INFO) << "MNISTIter: load " << (unsigned)img_.size(0) << " images, shuffle=" diff --git a/src/ndarray/autograd.cc b/src/ndarray/autograd.cc index 5ecea5decf03..421113f6edd7 100644 --- a/src/ndarray/autograd.cc +++ b/src/ndarray/autograd.cc @@ -29,6 +29,7 @@ #include #include "../executor/graph_executor.h" #include "./autograd.h" +#include "../c_api/c_api_common.h" namespace mxnet { namespace autograd { @@ -101,21 +102,6 @@ void AutogradRuntime::MarkVariables( } } -void AutogradRuntime::RecordImperativeFCompute(const nnvm::Op* op, - const nnvm::NodeAttrs& attrs, - std::vector *p_inputs, - std::vector *p_outputs) { - RecordOp(op, attrs, p_inputs, p_outputs, OpStatePtr()); -} - -void AutogradRuntime::RecordImperativeOperator(const OpStatePtr& state, - const nnvm::Op* op, - const nnvm::NodeAttrs& attrs, - std::vector *p_inputs, - std::vector *p_outputs) { - RecordOp(op, attrs, p_inputs, p_outputs, state); -} - std::shared_ptr 
AutogradRuntime::_GetSharedRef() { static std::shared_ptr inst(new AutogradRuntime()); return inst; @@ -126,12 +112,58 @@ AutogradRuntime* AutogradRuntime::Get() { return ptr; } -void AutogradRuntime::RecordOp(const nnvm::Op* op, - const nnvm::NodeAttrs& attrs, - std::vector *p_inputs, - std::vector *p_outputs, - const OpStatePtr& state) { +void AutogradRuntime::GetBackwardDependency(const nnvm::NodePtr& node, + uint32_t num_inputs, uint32_t num_outputs, + std::vector *p_save_inputs, + std::vector *p_save_outputs) { static auto& fgradient = nnvm::Op::GetAttr("FGradient"); + std::vector& save_inputs = *p_save_inputs; + std::vector& save_outputs = *p_save_outputs; + save_inputs.resize(num_inputs); + save_outputs.resize(num_outputs); + std::fill(save_inputs.begin(), save_inputs.end(), false); + std::fill(save_outputs.begin(), save_outputs.end(), false); + + node->inputs.clear(); + node->inputs.reserve(num_inputs); + for (uint32_t i = 0; i < num_inputs; ++i) { + node->inputs.emplace_back(NodeEntry{nullptr, i, 0}); + } + + if (fgradient.count(node->op())) { + std::vector ograd_entries; + ograd_entries.reserve(num_outputs); + for (uint32_t i = 0; i < num_outputs; ++i) { + ograd_entries.emplace_back(NodeEntry{nullptr, i, 1}); + } + auto igrad_entries = fgradient[node->op()](node, ograd_entries); + for (const auto& i : igrad_entries) { + if (i.node == nullptr && i.version == 0) { + save_inputs[i.index] = true; + } else if (i.node == node) { + save_outputs[i.index] = true; + } + } + DFSVisit(igrad_entries, [&](const NodePtr& gnode) { + if (!gnode || gnode == node) return; + for (const auto& i : gnode->inputs) { + if (i.node == nullptr && i.version == 0) { + save_inputs[i.index] = true; + } else if (i.node == node) { + save_outputs[i.index] = true; + } + } + }); + } +} + +void AutogradRuntime::RecordOp(nnvm::NodeAttrs&& attrs, + std::vector *p_inputs, + std::vector *p_outputs, + const OpStatePtr& state, + std::vector* p_save_inputs, + std::vector* p_save_outputs) { + 
MXAPIThreadLocalEntry *local_buff = MXAPIThreadLocalStore::Get(); std::vector& inputs = *p_inputs; std::vector& outputs = *p_outputs; @@ -144,7 +176,6 @@ void AutogradRuntime::RecordOp(const nnvm::Op* op, << "Please call backward first to clear the graph or do this out side of " << "a record section. "; } - if (!fgradient.count(attrs.op)) return; bool need_grad = false; for (const auto& i : inputs) { if (!i.entry_.is_none()) { @@ -155,36 +186,20 @@ void AutogradRuntime::RecordOp(const nnvm::Op* op, if (!need_grad) return; NodePtr nn_node = Node::Create(); - nn_node->attrs = attrs; + nn_node->attrs = std::move(attrs); nn_node->attrs.name = "node_" + std::to_string(node_count_++); - // Get backward dependency - std::vector save_inputs(inputs.size()), save_outputs(outputs.size()); - for (uint32_t i = 0; i < inputs.size(); ++i) { - nn_node->inputs.emplace_back(NodeEntry{nullptr, i, 0}); + if (p_save_inputs == nullptr) { + p_save_inputs = &(local_buff->save_inputs); + p_save_outputs = &(local_buff->save_outputs); + GetBackwardDependency( + nn_node, inputs.size(), outputs.size(), p_save_inputs, p_save_outputs); + } else { + nn_node->inputs.resize(inputs.size()); } - std::vector ograd_entries; - for (uint32_t i = 0; i < outputs.size(); ++i) { - ograd_entries.emplace_back(NodeEntry{nullptr, i, 1}); - } - auto igrad_entries = fgradient[nn_node->op()](nn_node, ograd_entries); - for (const auto& i : igrad_entries) { - if (i.node == nullptr && i.version == 0) { - save_inputs[i.index] = true; - } else if (i.node == nn_node) { - save_outputs[i.index] = true; - } - } - DFSVisit(igrad_entries, [&](const NodePtr& node) { - if (!node || node == nn_node) return; - for (const auto& i : node->inputs) { - if (i.node == nullptr && i.version == 0) { - save_inputs[i.index] = true; - } else if (i.node == nn_node) { - save_outputs[i.index] = true; - } - } - }); + + std::vector& save_inputs = *p_save_inputs; + std::vector& save_outputs = *p_save_outputs; AGNodePtr ag_node = 
AGNode::Create(nn_node); ag_node->state = state; diff --git a/src/ndarray/autograd.h b/src/ndarray/autograd.h index 199af350bf93..4632bc00ebf5 100644 --- a/src/ndarray/autograd.h +++ b/src/ndarray/autograd.h @@ -95,17 +95,19 @@ class AutogradRuntime { void MarkVariables(const std::vector& variables, const std::vector& grad_reqs, const std::vector& gradients); - /*! \brief record imperative operator which is executed by fcompute. */ - void RecordImperativeFCompute(const nnvm::Op* op, - const nnvm::NodeAttrs& attrs, - std::vector* p_inputs, - std::vector* p_outputs); - /*! \brief record imperative operator which is executed by operator. */ - void RecordImperativeOperator(const OpStatePtr& state, - const nnvm::Op* op, - const nnvm::NodeAttrs& attrs, - std::vector* p_inputs, - std::vector* p_outputs); + /*! \brief find the input/output ndarrays that are needed for backward */ + void GetBackwardDependency( + const nnvm::NodePtr& node, + uint32_t num_inputs, uint32_t num_outputs, + std::vector *p_save_inputs, + std::vector *p_save_outputs); + /*! \brief to record operator, return corresponding node. */ + void RecordOp(nnvm::NodeAttrs&& attrs, + std::vector* p_inputs, + std::vector* p_outputs, + const OpStatePtr& state = OpStatePtr(), + std::vector* p_save_inputs = nullptr, + std::vector* p_save_outputs = nullptr); /*! \brief compute the gradient of outputs w.r.t variables. */ void ComputeGradient(const std::vector& outputs, const std::vector& ograds, @@ -126,12 +128,6 @@ class AutogradRuntime { AutogradRuntime(); private: - /*! \brief to record operator, return corresponding node. */ - void RecordOp(const nnvm::Op* op, - const nnvm::NodeAttrs& attrs, - std::vector* p_inputs, - std::vector* p_outputs, - const OpStatePtr& state); /*! \brief AutogradRuntime singleton. */ static AutogradRuntime* instance_; /*! \brief indicate whether is training. 
*/ diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 139d97670bec..7b79d1051135 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -75,8 +75,7 @@ NDArray NDArray::Reshape(const TShape &shape) const { std::vector inputs, outputs; inputs.emplace_back(*this); outputs.emplace_back(std::move(ret)); - AutogradRuntime::Get()->RecordImperativeFCompute( - op, attrs, &inputs, &outputs); + AutogradRuntime::Get()->RecordOp(std::move(attrs), &inputs, &outputs); return outputs[0]; } else { CHECK_GE(shape_.Size(), shape.Size()) @@ -115,8 +114,7 @@ NDArray NDArray::Slice(index_t begin, index_t end) const { std::vector inputs, outputs; inputs.emplace_back(*this); outputs.emplace_back(std::move(ret)); - AutogradRuntime::Get()->RecordImperativeFCompute( - op, attrs, &inputs, &outputs); + AutogradRuntime::Get()->RecordOp(std::move(attrs), &inputs, &outputs); return outputs[0]; } else { return ret; diff --git a/src/operator/contrib/fft-inl.h b/src/operator/contrib/fft-inl.h index 5092f586fdf7..12474f183e84 100644 --- a/src/operator/contrib/fft-inl.h +++ b/src/operator/contrib/fft-inl.h @@ -54,6 +54,7 @@ struct FFTParam : public dmlc::Parameter { } }; +#if MXNET_USE_CUDA template class FFTOp : public Operator { public: @@ -102,7 +103,6 @@ class FFTOp : public Operator { Shape1(param_.compute_size*dim_*2), s); Tensor complex_data = Tensor(workspace.dptr_, Shape2(param_.compute_size, dim_*2), s); - #if MSHADOW_USE_CUDNN // start fft cufftHandle plan; cufftPlanMany(&plan, 1, &dim_, nullptr, 0, 0, nullptr, 0, 0, CUFFT_C2C, param_.compute_size); @@ -135,7 +135,6 @@ class FFTOp : public Operator { CHECK_EQ(cufftExecC2C(plan_remain, in_tmp, out_tmp, CUFFT_FORWARD), CUFFT_SUCCESS); cufftDestroy(plan_remain); } - #endif } virtual void Backward(const OpContext &ctx, @@ -170,7 +169,6 @@ class FFTOp : public Operator { // In this solution, out_grad must comes from a fft of real signal, // so that it is Hermitian symmetric, giving a real output // but if it is not, 
remember that we have implemented complex_take_real, and use this - #if MSHADOW_USE_CUDNN cufftHandle plan; cufftPlanMany(&plan, 1, &dim_, nullptr, 0, 0, nullptr, 0, 0, CUFFT_C2C, param_.compute_size); for (size_t idx = 0; idx < num_compute; ++idx) { @@ -203,7 +201,6 @@ class FFTOp : public Operator { req[fft::kData], complex_toreal(complex_data)); cufftDestroy(plan_remain); } - #endif // for bp, we should not divide it // but for comparison with np.fft.ifft, we should do it. // gdata /= dim_; @@ -214,6 +211,7 @@ class FFTOp : public Operator { int dim_, stride_, num_compute, n_ffts; bool init_cufft_; }; // class FFTOp +#endif // MXNET_USE_CUDA // Declare Factory Function, used for dispatch specialization template diff --git a/src/operator/contrib/fft.cc b/src/operator/contrib/fft.cc index 11f8425e07b1..6f78003baebb 100644 --- a/src/operator/contrib/fft.cc +++ b/src/operator/contrib/fft.cc @@ -28,17 +28,13 @@ namespace mxnet { namespace op { template<> Operator *CreateOp(FFTParam param, int dtype) { - LOG(FATAL) << "fft is only available for GPU."; - return NULL; + LOG(FATAL) << "fft is only available for GPU."; + return NULL; } Operator *FFTProp::CreateOperatorEx(Context ctx, std::vector *in_shape, std::vector *in_type) const { - std::vector out_shape, aux_shape; - std::vector out_type, aux_type; - CHECK(InferType(in_type, &out_type, &aux_type)); - CHECK(InferShape(in_shape, &out_shape, &aux_shape)); - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); } DMLC_REGISTER_PARAMETER(FFTParam); diff --git a/src/operator/contrib/fft.cu b/src/operator/contrib/fft.cu index 3017ce76756b..dfe3fbba6124 100644 --- a/src/operator/contrib/fft.cu +++ b/src/operator/contrib/fft.cu @@ -29,11 +29,12 @@ namespace op { template<> Operator* CreateOp(FFTParam param, int dtype) { - Operator *op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new FFTOp(param); - }) - return op; + Operator *op = NULL; + 
MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new FFTOp(param); + }) + return op; } + } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/ifft-inl.h b/src/operator/contrib/ifft-inl.h index abd5bb22a389..5e89c5b644ce 100644 --- a/src/operator/contrib/ifft-inl.h +++ b/src/operator/contrib/ifft-inl.h @@ -54,6 +54,7 @@ struct IFFTParam : public dmlc::Parameter { } }; +#if MXNET_USE_CUDA template class IFFTOp : public Operator { public: @@ -98,7 +99,6 @@ class IFFTOp : public Operator { Shape1(param_.compute_size*dim_*2), s); Tensor complex_data = Tensor(workspace.dptr_, Shape2(param_.compute_size, dim_*2), s); - #if MSHADOW_USE_CUDNN // start ifft cufftHandle plan; cufftPlanMany(&plan, 1, &dim_, nullptr, 0, 0, nullptr, 0, 0, CUFFT_C2C, param_.compute_size); @@ -131,7 +131,6 @@ class IFFTOp : public Operator { req[ifft::kOut], complex_toreal(complex_data)); cufftDestroy(plan_remain); } - #endif // commenting this out to be consistant with caffe // out /= dim_; } @@ -162,7 +161,6 @@ class IFFTOp : public Operator { Shape1(param_.compute_size*dim_*2), s); Tensor complex_data = Tensor(workspace.dptr_, Shape2(param_.compute_size, dim_*2), s); - #if MSHADOW_USE_CUDNN // start fft cufftHandle plan; cufftPlanMany(&plan, 1, &dim_, nullptr, 0, 0, nullptr, 0, 0, CUFFT_C2C, param_.compute_size); @@ -194,7 +192,6 @@ class IFFTOp : public Operator { CHECK_EQ(cufftExecC2C(plan_remain, in_tmp, out_tmp, CUFFT_FORWARD), CUFFT_SUCCESS); cufftDestroy(plan_remain); } - #endif // commenting this out to be consistant with caffe // gdata /= dim_; } @@ -205,6 +202,8 @@ class IFFTOp : public Operator { bool init_cufft_; }; // class IFFTOp +#endif // MXNET_USE_CUDA + // Declare Factory Function, used for dispatch specialization template Operator* CreateOp(IFFTParam param, int dtype); diff --git a/src/operator/contrib/ifft.cc b/src/operator/contrib/ifft.cc index 0ea3a7ec112f..95c79a785a16 100644 --- a/src/operator/contrib/ifft.cc +++ b/src/operator/contrib/ifft.cc @@ -29,17 
+29,13 @@ namespace op { template<> Operator *CreateOp(IFFTParam param, int dtype) { - LOG(FATAL) << "ifft is only available for GPU."; - return NULL; + LOG(FATAL) << "ifft is only available for GPU."; + return NULL; } Operator *IFFTProp::CreateOperatorEx(Context ctx, std::vector *in_shape, std::vector *in_type) const { - std::vector out_shape, aux_shape; - std::vector out_type, aux_type; - CHECK(InferType(in_type, &out_type, &aux_type)); - CHECK(InferShape(in_shape, &out_shape, &aux_shape)); - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); } DMLC_REGISTER_PARAMETER(IFFTParam); diff --git a/src/operator/contrib/ifft.cu b/src/operator/contrib/ifft.cu index 79795d8561bf..35cdb4836b37 100644 --- a/src/operator/contrib/ifft.cu +++ b/src/operator/contrib/ifft.cu @@ -29,11 +29,12 @@ namespace op { template<> Operator* CreateOp(IFFTParam param, int dtype) { - Operator *op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new IFFTOp(param); - }) - return op; + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new IFFTOp(param); + }) + return op; } + } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/multi_proposal-inl.h b/src/operator/contrib/multi_proposal-inl.h index 7cd465e0b09e..ddfe0628f306 100644 --- a/src/operator/contrib/multi_proposal-inl.h +++ b/src/operator/contrib/multi_proposal-inl.h @@ -40,95 +40,6 @@ #include "../operator_common.h" #include "../mshadow_op.h" -// extend NumericalParam -namespace mxnet { -namespace op { - -/*! 
-* \brief structure for numerical tuple input -* \tparam VType data type of param -*/ -template -struct NumericalParam { - NumericalParam() {} - explicit NumericalParam(VType *begin, VType *end) { - int32_t size = static_cast(end - begin); - info.resize(size); - for (int i = 0; i < size; ++i) { - info[i] = *(begin + i); - } - } - inline size_t ndim() const { - return info.size(); - } - std::vector info; -}; - -template -inline std::istream &operator>>(std::istream &is, NumericalParam ¶m) { - while (true) { - char ch = is.get(); - if (ch == '(') break; - if (!isspace(ch)) { - is.setstate(std::ios::failbit); - return is; - } - } - VType idx; - std::vector tmp; - // deal with empty case - size_t pos = is.tellg(); - char ch = is.get(); - if (ch == ')') { - param.info = tmp; - return is; - } - is.seekg(pos); - // finish deal - while (is >> idx) { - tmp.push_back(idx); - char ch; - do { - ch = is.get(); - } while (isspace(ch)); - if (ch == ',') { - while (true) { - ch = is.peek(); - if (isspace(ch)) { - is.get(); continue; - } - if (ch == ')') { - is.get(); break; - } - break; - } - if (ch == ')') break; - } else if (ch == ')') { - break; - } else { - is.setstate(std::ios::failbit); - return is; - } - } - param.info = tmp; - return is; -} - -template -inline std::ostream &operator<<(std::ostream &os, const NumericalParam ¶m) { - os << '('; - for (index_t i = 0; i < param.info.size(); ++i) { - if (i != 0) os << ','; - os << param.info[i]; - } - // python style tuple - if (param.info.size() == 1) os << ','; - os << ')'; - return os; -} - -} // namespace op -} // namespace mxnet namespace mxnet { namespace op { @@ -144,8 +55,8 @@ struct MultiProposalParam : public dmlc::Parameter { int rpn_post_nms_top_n; float threshold; int rpn_min_size; - NumericalParam scales; - NumericalParam ratios; + nnvm::Tuple scales; + nnvm::Tuple ratios; int feature_stride; bool output_score; bool iou_loss; @@ -161,10 +72,10 @@ struct MultiProposalParam : public dmlc::Parameter { 
DMLC_DECLARE_FIELD(rpn_min_size).set_default(16) .describe("Minimum height or width in proposal"); tmp[0] = 4.0f; tmp[1] = 8.0f; tmp[2] = 16.0f; tmp[3] = 32.0f; - DMLC_DECLARE_FIELD(scales).set_default(NumericalParam(tmp, tmp + 4)) + DMLC_DECLARE_FIELD(scales).set_default(nnvm::Tuple(tmp, tmp + 4)) .describe("Used to generate anchor windows by enumerating scales"); tmp[0] = 0.5f; tmp[1] = 1.0f; tmp[2] = 2.0f; - DMLC_DECLARE_FIELD(ratios).set_default(NumericalParam(tmp, tmp + 3)) + DMLC_DECLARE_FIELD(ratios).set_default(nnvm::Tuple(tmp, tmp + 3)) .describe("Used to generate anchor windows by enumerating ratios"); DMLC_DECLARE_FIELD(feature_stride).set_default(16) .describe("The size of the receptive field each unit in the convolution layer of the rpn," @@ -302,11 +213,11 @@ inline void _Transform(float scale, // out_anchors must have shape (n, 5), where n is ratios.size() * scales.size() inline void GenerateAnchors(const std::vector& base_anchor, - const std::vector& ratios, - const std::vector& scales, + const nnvm::Tuple& ratios, + const nnvm::Tuple& scales, std::vector *out_anchors) { - for (size_t j = 0; j < ratios.size(); ++j) { - for (size_t k = 0; k < scales.size(); ++k) { + for (size_t j = 0; j < ratios.ndim(); ++j) { + for (size_t k = 0; k < scales.ndim(); ++k) { _Transform(scales[k], ratios[j], base_anchor, out_anchors); } } diff --git a/src/operator/contrib/multi_proposal.cu b/src/operator/contrib/multi_proposal.cu index cb9996344e3e..082de6a397a7 100644 --- a/src/operator/contrib/multi_proposal.cu +++ b/src/operator/contrib/multi_proposal.cu @@ -460,11 +460,11 @@ class MultiProposalGPUOp : public Operator{ base_anchor[1] = 0.0; base_anchor[2] = param_.feature_stride - 1.0; base_anchor[3] = param_.feature_stride - 1.0; - CHECK_EQ(num_anchors, param_.ratios.info.size() * param_.scales.info.size()); + CHECK_EQ(num_anchors, param_.ratios.ndim() * param_.scales.ndim()); std::vector anchors; utils::GenerateAnchors(base_anchor, - param_.ratios.info, - 
param_.scales.info, + param_.ratios, + param_.scales, &anchors); // Copy generated anchors to GPU diff --git a/src/operator/contrib/proposal-inl.h b/src/operator/contrib/proposal-inl.h index 3d1851cedbac..f989cdec3767 100644 --- a/src/operator/contrib/proposal-inl.h +++ b/src/operator/contrib/proposal-inl.h @@ -38,95 +38,6 @@ #include "../operator_common.h" #include "../mshadow_op.h" -// extend NumericalParam -namespace mxnet { -namespace op { - -/*! -* \brief structure for numerical tuple input -* \tparam VType data type of param -*/ -template -struct NumericalParam { - NumericalParam() {} - explicit NumericalParam(VType *begin, VType *end) { - int32_t size = static_cast(end - begin); - info.resize(size); - for (int i = 0; i < size; ++i) { - info[i] = *(begin + i); - } - } - inline size_t ndim() const { - return info.size(); - } - std::vector info; -}; - -template -inline std::istream &operator>>(std::istream &is, NumericalParam ¶m) { - while (true) { - char ch = is.get(); - if (ch == '(') break; - if (!isspace(ch)) { - is.setstate(std::ios::failbit); - return is; - } - } - VType idx; - std::vector tmp; - // deal with empty case - size_t pos = is.tellg(); - char ch = is.get(); - if (ch == ')') { - param.info = tmp; - return is; - } - is.seekg(pos); - // finish deal - while (is >> idx) { - tmp.push_back(idx); - char ch; - do { - ch = is.get(); - } while (isspace(ch)); - if (ch == ',') { - while (true) { - ch = is.peek(); - if (isspace(ch)) { - is.get(); continue; - } - if (ch == ')') { - is.get(); break; - } - break; - } - if (ch == ')') break; - } else if (ch == ')') { - break; - } else { - is.setstate(std::ios::failbit); - return is; - } - } - param.info = tmp; - return is; -} - -template -inline std::ostream &operator<<(std::ostream &os, const NumericalParam ¶m) { - os << '('; - for (index_t i = 0; i < param.info.size(); ++i) { - if (i != 0) os << ','; - os << param.info[i]; - } - // python style tuple - if (param.info.size() == 1) os << ','; - os << ')'; - 
return os; -} - -} // namespace op -} // namespace mxnet namespace mxnet { namespace op { @@ -142,8 +53,8 @@ struct ProposalParam : public dmlc::Parameter { int rpn_post_nms_top_n; float threshold; int rpn_min_size; - NumericalParam scales; - NumericalParam ratios; + nnvm::Tuple scales; + nnvm::Tuple ratios; int feature_stride; bool output_score; bool iou_loss; @@ -159,10 +70,10 @@ struct ProposalParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(rpn_min_size).set_default(16) .describe("Minimum height or width in proposal"); tmp[0] = 4.0f; tmp[1] = 8.0f; tmp[2] = 16.0f; tmp[3] = 32.0f; - DMLC_DECLARE_FIELD(scales).set_default(NumericalParam(tmp, tmp + 4)) + DMLC_DECLARE_FIELD(scales).set_default(nnvm::Tuple(tmp, tmp + 4)) .describe("Used to generate anchor windows by enumerating scales"); tmp[0] = 0.5f; tmp[1] = 1.0f; tmp[2] = 2.0f; - DMLC_DECLARE_FIELD(ratios).set_default(NumericalParam(tmp, tmp + 3)) + DMLC_DECLARE_FIELD(ratios).set_default(nnvm::Tuple(tmp, tmp + 3)) .describe("Used to generate anchor windows by enumerating ratios"); DMLC_DECLARE_FIELD(feature_stride).set_default(16) .describe("The size of the receptive field each unit in the convolution layer of the rpn," @@ -300,11 +211,11 @@ inline void _Transform(float scale, // out_anchors must have shape (n, 5), where n is ratios.size() * scales.size() inline void GenerateAnchors(const std::vector& base_anchor, - const std::vector& ratios, - const std::vector& scales, + const nnvm::Tuple& ratios, + const nnvm::Tuple& scales, std::vector *out_anchors) { - for (size_t j = 0; j < ratios.size(); ++j) { - for (size_t k = 0; k < scales.size(); ++k) { + for (size_t j = 0; j < ratios.ndim(); ++j) { + for (size_t k = 0; k < scales.ndim(); ++k) { _Transform(scales[k], ratios[j], base_anchor, out_anchors); } } diff --git a/src/operator/contrib/proposal.cc b/src/operator/contrib/proposal.cc index ec539003b944..ccb541a403a2 100644 --- a/src/operator/contrib/proposal.cc +++ b/src/operator/contrib/proposal.cc @@ -335,11 
+335,11 @@ class ProposalOp : public Operator{ base_anchor[1] = 0.0; base_anchor[2] = param_.feature_stride - 1.0; base_anchor[3] = param_.feature_stride - 1.0; - CHECK_EQ(num_anchors, param_.ratios.info.size() * param_.scales.info.size()); + CHECK_EQ(num_anchors, param_.ratios.ndim() * param_.scales.ndim()); std::vector anchors; utils::GenerateAnchors(base_anchor, - param_.ratios.info, - param_.scales.info, + param_.ratios, + param_.scales, &anchors); std::memcpy(workspace_proposals.dptr_, &anchors[0], sizeof(float) * anchors.size()); diff --git a/src/operator/contrib/proposal.cu b/src/operator/contrib/proposal.cu index 209ef79a2aaf..9f56685a7a7d 100644 --- a/src/operator/contrib/proposal.cu +++ b/src/operator/contrib/proposal.cu @@ -442,11 +442,11 @@ class ProposalGPUOp : public Operator{ base_anchor[1] = 0.0; base_anchor[2] = param_.feature_stride - 1.0; base_anchor[3] = param_.feature_stride - 1.0; - CHECK_EQ(num_anchors, param_.ratios.info.size() * param_.scales.info.size()); + CHECK_EQ(num_anchors, param_.ratios.ndim() * param_.scales.ndim()); std::vector anchors; utils::GenerateAnchors(base_anchor, - param_.ratios.info, - param_.scales.info, + param_.ratios, + param_.scales, &anchors); // Copy generated anchors to GPU diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h index 0edaee1dae32..a9e2c1bd6e94 100644 --- a/src/operator/convolution-inl.h +++ b/src/operator/convolution-inl.h @@ -103,8 +103,49 @@ struct ConvolutionParam : public dmlc::Parameter { index_t DilatedKernelSize(int dim) const { return 1 + (kernel[dim] - 1) * dilate[dim]; } + + bool operator==(const ConvolutionParam& other) const { + return this->kernel == other.kernel && + this->stride == other.stride && + this->dilate == other.dilate && + this->pad == other.pad && + this->num_filter == other.num_filter && + this->num_group == other.num_group && + this->workspace == other.workspace && + this->no_bias == other.no_bias && + this->cudnn_tune == other.cudnn_tune && + 
this->cudnn_off == other.cudnn_off && + this->layout == other.layout; + } }; +} // namespace op +} // namespace mxnet + +namespace std { +template<> +struct hash { + size_t operator()(const mxnet::op::ConvolutionParam& val) { + size_t ret = 0; + ret = dmlc::HashCombine(ret, val.kernel); + ret = dmlc::HashCombine(ret, val.stride); + ret = dmlc::HashCombine(ret, val.dilate); + ret = dmlc::HashCombine(ret, val.pad); + ret = dmlc::HashCombine(ret, val.num_filter); + ret = dmlc::HashCombine(ret, val.num_group); + ret = dmlc::HashCombine(ret, val.workspace); + ret = dmlc::HashCombine(ret, val.no_bias); + ret = dmlc::HashCombine(ret, val.cudnn_tune); + ret = dmlc::HashCombine(ret, val.cudnn_off); + ret = dmlc::HashCombine(ret, val.layout); + return ret; + } +}; +} // namespace std + +namespace mxnet { +namespace op { + template class ConvolutionOp : public Operator { public: diff --git a/src/operator/cudnn_algoreg-inl.h b/src/operator/cudnn_algoreg-inl.h index dc5db6bbc8b7..b27d2be297fe 100644 --- a/src/operator/cudnn_algoreg-inl.h +++ b/src/operator/cudnn_algoreg-inl.h @@ -61,37 +61,22 @@ class CuDNNAlgo { bool is_tensor_core_algo_; }; +template class CuDNNAlgoReg { public: - template - std::string GetKey(const Param ¶m, const std::vector &in_shape, - const std::vector &out_shape, - cudnnDataType_t cudnn_data_type, - cudnnDataType_t cudnn_forward_compute_type, - cudnnDataType_t cudnn_backward_compute_type, - int sm_arch) { - std::ostringstream oss; - oss << "inputs="; - for (auto &i : in_shape) - oss << i << ";"; - oss << "outputs="; - for (auto &i : out_shape) - oss << i << ";"; - auto dict = param.__DICT__(); - for (auto &k : dict) - oss << k.first << "=" << k.second << ";"; - oss << "cudnn_data_type=" << cudnn_data_type << ";"; - oss << "cudnn_forward_compute_type=" << cudnn_forward_compute_type << ";"; - oss << "cudnn_backward_compute_type=" << cudnn_backward_compute_type << ";"; - // All GPUs of the same compute capability (SM arch) share an algo selection. 
- oss << "sm_arch=" << sm_arch << ";"; - return oss.str(); - } - - bool Find(std::string key, + bool Find(const ParamType ¶m, + const std::vector &in_shape, + const std::vector &out_shape, + cudnnDataType_t cudnn_data_type, + cudnnDataType_t cudnn_forward_compute_type, + cudnnDataType_t cudnn_backward_compute_type, + int sm_arch, CuDNNAlgo *fwd, CuDNNAlgo *bwd, CuDNNAlgo *flt) { + CHECK(in_shape.size() == 2 || in_shape.size() == 3); + ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type, + cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch}; std::lock_guard guard(lock_); auto i = reg_.find(key); if (i != reg_.end()) { @@ -103,10 +88,19 @@ class CuDNNAlgoReg { return false; } - void Register(std::string key, + void Register(const ParamType ¶m, + const std::vector &in_shape, + const std::vector &out_shape, + cudnnDataType_t cudnn_data_type, + cudnnDataType_t cudnn_forward_compute_type, + cudnnDataType_t cudnn_backward_compute_type, + int sm_arch, const CuDNNAlgo &fwd, const CuDNNAlgo &bwd, const CuDNNAlgo &flt) { + CHECK(in_shape.size() == 2 || in_shape.size() == 3); + ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type, + cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch}; std::lock_guard guard(lock_); if (reg_.size() % 50 == 0) { LOG(INFO) << "Running performance tests to find the best convolution " @@ -134,9 +128,49 @@ class CuDNNAlgoReg { CuDNNAlgo flt; }; + struct ParamKey { + ParamType param; + TShape data_shape, weight_shape, out_shape; + cudnnDataType_t cudnn_data_type; + cudnnDataType_t cudnn_forward_compute_type; + cudnnDataType_t cudnn_backward_compute_type; + int sm_arch; + + bool operator==(const ParamKey& other) const { + return this->param == other.param && + this->data_shape == other.data_shape && + this->weight_shape == other.weight_shape && + this->out_shape == other.out_shape && + this->cudnn_data_type == other.cudnn_data_type && + this->cudnn_forward_compute_type == 
other.cudnn_forward_compute_type && + this->cudnn_backward_compute_type == other.cudnn_backward_compute_type && + this->sm_arch == other.sm_arch; + } + }; + + struct ParamHash { + size_t operator()(const ParamKey& key) const { + std::hash hash_param; + size_t ret = hash_param(key.param); + ret = dmlc::HashCombine(ret, key.data_shape); + ret = dmlc::HashCombine(ret, key.weight_shape); + ret = dmlc::HashCombine(ret, key.out_shape); + for (const auto& i : key.out_shape) ret = dmlc::HashCombine(ret, i); + ret = dmlc::HashCombine(ret, static_cast(key.cudnn_data_type)); + ret = dmlc::HashCombine(ret, static_cast(key.cudnn_forward_compute_type)); + ret = dmlc::HashCombine(ret, static_cast(key.cudnn_backward_compute_type)); + ret = dmlc::HashCombine(ret, key.sm_arch); + return ret; + } + }; + std::mutex lock_; - std::unordered_map reg_; + std::unordered_map reg_; }; + +typedef CuDNNAlgoReg CuDNNConvAlgoReg; +typedef CuDNNAlgoReg CuDNNDeconvAlgoReg; + #endif // __CUDACC__ && CUDNN } // namespace op } // namespace mxnet diff --git a/src/operator/cudnn_algoreg.cc b/src/operator/cudnn_algoreg.cc index 5aa8688c8148..5b0e73f0b19d 100644 --- a/src/operator/cudnn_algoreg.cc +++ b/src/operator/cudnn_algoreg.cc @@ -32,9 +32,16 @@ namespace mxnet { namespace op { #if MXNET_USE_CUDNN == 1 -CuDNNAlgoReg *CuDNNAlgoReg::Get() { - static CuDNNAlgoReg *ptr = new CuDNNAlgoReg(); - return ptr; +template<> +CuDNNAlgoReg *CuDNNAlgoReg::Get() { + static CuDNNAlgoReg inst; + return &inst; +} + +template<> +CuDNNAlgoReg *CuDNNAlgoReg::Get() { + static CuDNNAlgoReg inst; + return &inst; } #endif // CUDNN } // namespace op diff --git a/src/operator/cudnn_convolution-inl.h b/src/operator/cudnn_convolution-inl.h index 428278498337..b2b59944e895 100644 --- a/src/operator/cudnn_convolution-inl.h +++ b/src/operator/cudnn_convolution-inl.h @@ -580,11 +580,10 @@ class CuDNNConvolutionOp : public Operator { const std::vector& out_shape, cudnnDataType_t cudnn_forward_compute_type, cudnnDataType_t 
cudnn_backward_compute_type) { - std::string key = CuDNNAlgoReg::Get()->GetKey(param_, in_shape, out_shape, dtype_, - cudnn_forward_compute_type, - cudnn_backward_compute_type, - SMArch(ctx.dev_id)); - if (!CuDNNAlgoReg::Get()->Find(key, &forward_algo_, &back_algo_, &back_algo_w_)) { + if (!CuDNNConvAlgoReg::Get()->Find(param_, in_shape, out_shape, dtype_, + cudnn_forward_compute_type, cudnn_backward_compute_type, + SMArch(ctx.dev_id), &forward_algo_, &back_algo_, + &back_algo_w_)) { // Not in algo registry, must determine via *Get*() or *Find*() Engine::VarHandle var = Engine::Get()->NewVariable(); Engine::Get()->PushSync([=](RunContext rctx) { @@ -772,8 +771,11 @@ class CuDNNConvolutionOp : public Operator { // convolution will match only if identically specified. // We're caching results of *Get* as well as *Find*, but these records // will be held distinctly because param_.cudnn_tune is part of the key. - CuDNNAlgoReg::Get()->Register(key, this->forward_algo_, this->back_algo_, - this->back_algo_w_); + CuDNNConvAlgoReg::Get()->Register(param_, in_shape, out_shape, dtype_, + cudnn_forward_compute_type, + cudnn_backward_compute_type, + SMArch(ctx.dev_id), this->forward_algo_, + this->back_algo_, this->back_algo_w_); }, ctx, {}, {var}); Engine::Get()->WaitForVar(var); Engine::Get()->DeleteVariable([](RunContext s) {}, ctx, var); diff --git a/src/operator/cudnn_deconvolution-inl.h b/src/operator/cudnn_deconvolution-inl.h index de3e70c7d6a7..5e9b7c5704d0 100644 --- a/src/operator/cudnn_deconvolution-inl.h +++ b/src/operator/cudnn_deconvolution-inl.h @@ -598,11 +598,11 @@ class CuDNNDeconvolutionOp : public Operator { const std::vector& out_shape, cudnnDataType_t cudnn_forward_compute_type, cudnnDataType_t cudnn_backward_compute_type) { - std::string key = CuDNNAlgoReg::Get()->GetKey(param_, in_shape, out_shape, dtype_, - cudnn_forward_compute_type, - cudnn_backward_compute_type, - SMArch(ctx.dev_id)); - if (!CuDNNAlgoReg::Get()->Find(key, &forward_algo_, 
&back_algo_, &back_algo_w_)) { + if (!CuDNNDeconvAlgoReg::Get()->Find(param_, in_shape, out_shape, dtype_, + cudnn_forward_compute_type, + cudnn_backward_compute_type, + SMArch(ctx.dev_id), &forward_algo_, + &back_algo_, &back_algo_w_)) { // Not in algo registry, must determine via *Get*() or *Find*() Engine::VarHandle var = Engine::Get()->NewVariable(); Engine::Get()->PushSync([=](RunContext rctx) { @@ -793,8 +793,11 @@ class CuDNNDeconvolutionOp : public Operator { // convolution will match only if identically specified. // We're caching results of *Get* as well as *Find*, but these records // will be held distinctly because param_.cudnn_tune is part of the key. - CuDNNAlgoReg::Get()->Register(key, this->forward_algo_, this->back_algo_, - this->back_algo_w_); + CuDNNDeconvAlgoReg::Get()->Register(param_, in_shape, out_shape, dtype_, + cudnn_forward_compute_type, + cudnn_backward_compute_type, + SMArch(ctx.dev_id), this->forward_algo_, + this->back_algo_, this->back_algo_w_); }, ctx, {}, {var}); Engine::Get()->WaitForVar(var); Engine::Get()->DeleteVariable([](RunContext s) {}, ctx, var); diff --git a/src/operator/deconvolution-inl.h b/src/operator/deconvolution-inl.h index dd77c150c970..a968ce44a800 100644 --- a/src/operator/deconvolution-inl.h +++ b/src/operator/deconvolution-inl.h @@ -144,8 +144,53 @@ struct DeconvolutionParam : public dmlc::Parameter { index_t DilatedKernelSize(int dim) const { return 1 + (kernel[dim] - 1) * dilate[dim]; } + + bool operator==(const DeconvolutionParam& other) const { + return this->kernel == other.kernel && + this->stride == other.stride && + this->dilate == other.dilate && + this->pad == other.pad && + this->adj == other.adj && + this->target_shape == other.target_shape && + this->num_filter == other.num_filter && + this->num_group == other.num_group && + this->workspace == other.workspace && + this->no_bias == other.no_bias && + this->cudnn_tune == other.cudnn_tune && + this->cudnn_off == other.cudnn_off && + this->layout == 
other.layout; + } }; +} // namespace op +} // namespace mxnet + +namespace std { +template<> +struct hash { + size_t operator()(const mxnet::op::DeconvolutionParam& val) { + size_t ret = 0; + ret = dmlc::HashCombine(ret, val.kernel); + ret = dmlc::HashCombine(ret, val.stride); + ret = dmlc::HashCombine(ret, val.dilate); + ret = dmlc::HashCombine(ret, val.pad); + ret = dmlc::HashCombine(ret, val.adj); + ret = dmlc::HashCombine(ret, val.target_shape); + ret = dmlc::HashCombine(ret, val.num_filter); + ret = dmlc::HashCombine(ret, val.num_group); + ret = dmlc::HashCombine(ret, val.workspace); + ret = dmlc::HashCombine(ret, val.no_bias); + ret = dmlc::HashCombine(ret, val.cudnn_tune); + ret = dmlc::HashCombine(ret, val.cudnn_off); + ret = dmlc::HashCombine(ret, val.layout); + return ret; + } +}; +} // namespace std + +namespace mxnet { +namespace op { + template class DeconvolutionOp : public Operator { public: From 39ff76494b6b1e7a6048ae80d66bb87c46263537 Mon Sep 17 00:00:00 2001 From: Kai Li <1196594711@qq.com> Date: Sun, 27 Aug 2017 13:17:16 +0800 Subject: [PATCH 048/448] Update io.md (#7634) --- docs/api/python/io.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api/python/io.md b/docs/api/python/io.md index ce8245b73fe8..ecf3e75ac0d5 100644 --- a/docs/api/python/io.md +++ b/docs/api/python/io.md @@ -35,7 +35,7 @@ Let's see a complete example of how to use data iterator in model training. 
>>> data = mx.sym.Variable('data') >>> label = mx.sym.Variable('softmax_label') >>> fullc = mx.sym.FullyConnected(data=data, num_hidden=1) ->>> loss = mx.sym.SoftmaxOutput(data=data, label=label) +>>> loss = mx.sym.SoftmaxOutput(data=fullc, label=label) >>> mod = mx.mod.Module(loss, data_names=['data'], label_names=['softmax_label']) >>> mod.bind(data_shapes=nd_iter.provide_data, label_shapes=nd_iter.provide_label) >>> mod.fit(nd_iter, num_epoch=2) From 9aa051c2e87d41b4f2a61fb62728ecdf364f8997 Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Sun, 27 Aug 2017 00:14:28 -0700 Subject: [PATCH 049/448] fix tests (#7633) --- tests/python/gpu/test_operator_gpu.py | 4 ++-- tests/python/unittest/test_loss.py | 18 ++++++------------ 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 11d146cae840..0c5771ebffb6 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -1346,11 +1346,11 @@ def test_sequence_reverse(): def test_autograd_save_memory(): - x = mx.nd.zeros((128, 1024, 1024), ctx=mx.gpu(0)) + x = mx.nd.zeros((128, 512, 512), ctx=mx.gpu(0)) x.attach_grad() with mx.autograd.record(): - for i in range(50): + for i in range(200): x = x + 1 x.wait_to_read() x.backward() diff --git a/tests/python/unittest/test_loss.py b/tests/python/unittest/test_loss.py index b864215ca1d1..85875c604bf0 100644 --- a/tests/python/unittest/test_loss.py +++ b/tests/python/unittest/test_loss.py @@ -63,7 +63,6 @@ def get_net(num_hidden): def test_ce_loss(): - mx.random.seed(1234) np.random.seed(1234) nclass = 10 N = 20 @@ -83,7 +82,6 @@ def test_ce_loss(): def test_bce_loss(): - mx.random.seed(1234) np.random.seed(1234) N = 20 data = mx.random.uniform(-1, 1, shape=(N, 20)) @@ -111,7 +109,6 @@ def test_bce_equal_ce2(): def test_kl_loss(): - mx.random.seed(1234) np.random.seed(1234) N = 20 data = mx.random.uniform(-1, 1, shape=(N, 10)) @@ -129,12 +126,11 @@ def 
test_kl_loss(): def test_l2_loss(): - mx.random.seed(1234) np.random.seed(1234) N = 20 data = mx.random.uniform(-1, 1, shape=(N, 10)) label = mx.random.uniform(-1, 1, shape=(N, 1)) - data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label') + data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label', shuffle=True) output = get_net(1) l = mx.symbol.Variable('label') Loss = gluon.loss.L2Loss() @@ -142,26 +138,25 @@ def test_l2_loss(): loss = Loss(output, l) loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 1.}, - eval_metric=mx.metric.Loss()) + mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.1, 'wd': 0.00045}, + initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss()) assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 def test_l1_loss(): - mx.random.seed(1234) np.random.seed(1234) N = 20 data = mx.random.uniform(-1, 1, shape=(N, 10)) label = mx.random.uniform(-1, 1, shape=(N, 1)) - data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label') + data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label', shuffle=True) output = get_net(1) l = mx.symbol.Variable('label') Loss = gluon.loss.L1Loss() loss = Loss(output, l) loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.1}, - initializer=mx.init.Uniform(0.5), eval_metric=mx.metric.Loss()) + mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, + initializer=mx.init.Xavier(magnitude=3), eval_metric=mx.metric.Loss()) assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.1 @@ -196,7 +191,6 @@ def test_ctc_loss(): def test_sample_weight_loss(): - mx.random.seed(1234) np.random.seed(1234) nclass = 10 N = 20 From 
e05129774e76206fe890b511c346953107b05fce Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Sun, 27 Aug 2017 01:12:23 -0700 Subject: [PATCH 050/448] [build] explicitly install JDK8 (#7574) * explicitly install openjdk8 * handle earlier version of ubuntu * install software-properties-common * update -y * update commands --- docker/install/scala.sh | 10 +++++++++- docs/get_started/build_from_source.md | 8 +++++++- tests/ci_build/Dockerfile.ubuntu1404_cuda75_cudnn5 | 8 +++++++- tests/ci_build/install/ubuntu_install_scala.sh | 9 +++++++-- 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/docker/install/scala.sh b/docker/install/scala.sh index bb0bb9c900d4..c1d2de6c75b2 100755 --- a/docker/install/scala.sh +++ b/docker/install/scala.sh @@ -19,7 +19,15 @@ # install libraries for mxnet's scala package on ubuntu -apt-get install -y maven default-jdk + +apt-get install -y software-properties-common +add-apt-repository -y ppa:webupd8team/java +apt-get update +echo "oracle-java8-installer shared/accepted-oracle-license-v1-1 select true" | debconf-set-selections +apt-get install -y oracle-java8-installer +apt-get install -y oracle-java8-set-default + +apt-get install -y maven wget http://downloads.lightbend.com/scala/2.11.8/scala-2.11.8.deb dpkg -i scala-2.11.8.deb diff --git a/docs/get_started/build_from_source.md b/docs/get_started/build_from_source.md index 4ff2cc09aa82..9bf397bc9f14 100644 --- a/docs/get_started/build_from_source.md +++ b/docs/get_started/build_from_source.md @@ -367,7 +367,13 @@ Both JDK and Maven are required to build the Scala package.
```bash -sudo apt-get install -y maven default-jdk +apt-get install -y software-properties-common +add-apt-repository -y ppa:webupd8team/java +apt-get update +echo "oracle-java8-installer shared/accepted-oracle-license-v1-1 select true" | debconf-set-selections +apt-get install -y oracle-java8-installer +apt-get install -y oracle-java8-set-default +apt-get install -y maven ```
diff --git a/tests/ci_build/Dockerfile.ubuntu1404_cuda75_cudnn5 b/tests/ci_build/Dockerfile.ubuntu1404_cuda75_cudnn5 index e9810af6b72c..88fd7cea6fcb 100644 --- a/tests/ci_build/Dockerfile.ubuntu1404_cuda75_cudnn5 +++ b/tests/ci_build/Dockerfile.ubuntu1404_cuda75_cudnn5 @@ -23,7 +23,13 @@ RUN cd /usr/src/gtest && cmake CMakeLists.txt && make && cp *.a /usr/lib RUN pip install nose cpplint 'pylint==1.4.4' 'astroid==1.3.6' # MAVEN -RUN apt-get install -y maven default-jdk +RUN apt-get install -y software-properties-common +RUN add-apt-repository ppa:webupd8team/java -y +RUN apt-get update +RUN echo "oracle-java8-installer shared/accepted-oracle-license-v1-1 select true" | debconf-set-selections +RUN apt-get install -y oracle-java8-installer +RUN apt-get install -y oracle-java8-set-default +RUN apt-get install -y maven # R RUN apt-get install -y software-properties-common r-base-core libcurl4-openssl-dev libssl-dev libxml2-dev diff --git a/tests/ci_build/install/ubuntu_install_scala.sh b/tests/ci_build/install/ubuntu_install_scala.sh index 712eff98b02a..169ece036d2f 100755 --- a/tests/ci_build/install/ubuntu_install_scala.sh +++ b/tests/ci_build/install/ubuntu_install_scala.sh @@ -19,5 +19,10 @@ # install libraries for mxnet's scala package on ubuntu -apt-get update && apt-get install -y \ - maven default-jdk +apt-get install -y software-properties-common +add-apt-repository -y ppa:webupd8team/java +apt-get update +echo "oracle-java8-installer shared/accepted-oracle-license-v1-1 select true" | debconf-set-selections +apt-get install -y oracle-java8-installer +apt-get install -y oracle-java8-set-default +apt-get update && apt-get install -y maven From aceef5abf3db968ee98333c1454a951dfbf07a43 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Mon, 28 Aug 2017 11:00:15 -0700 Subject: [PATCH 051/448] Add script to build doc files for all versions (#7636) * Add script to build doc files for all versions * Fix * Use add versipn script of each different version --- 
docs/build_version_doc/build_all_version.sh | 82 +++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100755 docs/build_version_doc/build_all_version.sh diff --git a/docs/build_version_doc/build_all_version.sh b/docs/build_version_doc/build_all_version.sh new file mode 100755 index 000000000000..140e51a3d3a1 --- /dev/null +++ b/docs/build_version_doc/build_all_version.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This script is for locally building website for all versions +# Built files are stored in $built +# Version numbers are stored in $tag_list. +# Version numbers are ordered from latest to old and final one is master. +tag_list="0.11.0.rc3 master" + +mxnet_url="https://github.com/apache/incubator-mxnet.git" +mxnet_folder="apache_mxnet" +built="VersionedWeb" +mkdir $built +mkdir "$built/versions" + +git clone $mxnet_url $mxnet_folder --recursive +cd "$mxnet_folder/docs" +tag_file="tag_list.txt" + +# Write all version numbers into $tag_file +for tag in $tag_list; do + if [ $tag != 'master' ] + then + echo "$tag" >> "$tag_file" + fi +done + +# Build all versions and use latest version(First version number in $tag_list) as landing page. 
+version_num=0 +for tag in $tag_list; do + if [ $tag == 'master' ] + then + git checkout master + else + git checkout "tags/$tag" + fi + + git submodule update || exit 1 + cd .. + make clean + cd docs + make clean + make html USE_OPENMP=0 || exit 1 + python build_version_doc/AddVersion.py --file_path "_build/html/" --current_version "$tag" || exit 1 + + if [ $tag != 'master' ] + then + python build_version_doc/AddPackageLink.py --file_path "_build/html/get_started/install.html" \ + --current_version "$tag" || exit 1 + fi + + if [ $version_num == 0 ] + then + cp -a _build/html/. "../../$built" + else + file_loc="../../$built/versions/$tag" + mkdir "$file_loc" + cp -a _build/html/. "$file_loc" + fi + + ((++version_num)) +done + +mv "$tag_file" "../../$built/tag.txt" +cd ../.. +rm -rf "$mxnet_folder" From e845cec1c09626bd312f76d5b0ba56b1b986c57f Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Mon, 28 Aug 2017 11:05:50 -0700 Subject: [PATCH 052/448] add fashion mnist and move mnists to s3 (#7635) * add fashion mnist and move mnists to s3 * refactor --- python/mxnet/gluon/data/vision.py | 68 ++++++++++++++++++------ tests/python/unittest/test_gluon_data.py | 1 + 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/python/mxnet/gluon/data/vision.py b/python/mxnet/gluon/data/vision.py index b63624508124..24c060c54c84 100644 --- a/python/mxnet/gluon/data/vision.py +++ b/python/mxnet/gluon/data/vision.py @@ -40,6 +40,8 @@ def __init__(self, root, train, transform): self._data = None self._label = None + if not os.path.isdir(self._root): + os.makedirs(self._root) self._get_data() def __getitem__(self, idx): @@ -70,24 +72,29 @@ class MNIST(_DownloadedDataset): transform=lambda data, label: (data.astype(np.float32)/255, label) """ - def __init__(self, root='~/.mxnet/datasets/', train=True, + def __init__(self, root='~/.mxnet/datasets/mnist', train=True, transform=None): + self._base_url = 'https://apache-mxnet.s3.amazonaws.com/gluon/dataset/mnist/' + self._train_data 
= ('train-images-idx3-ubyte.gz', + '6c95f4b05d2bf285e1bfb0e7960c31bd3b3f8a7d') + self._train_label = ('train-labels-idx1-ubyte.gz', + '2a80914081dc54586dbdf242f9805a6b8d2a15fc') + self._test_data = ('t10k-images-idx3-ubyte.gz', + 'c3a25af1f52dad7f726cce8cacb138654b760d48') + self._test_label = ('t10k-labels-idx1-ubyte.gz', + '763e7fa3757d93b0cdec073cef058b2004252c17') super(MNIST, self).__init__(root, train, transform) def _get_data(self): - if not os.path.isdir(self._root): - os.makedirs(self._root) - url = 'http://data.mxnet.io/data/mnist/' if self._train: - data_file = download(url+'train-images-idx3-ubyte.gz', self._root, - sha1_hash='6c95f4b05d2bf285e1bfb0e7960c31bd3b3f8a7d') - label_file = download(url+'train-labels-idx1-ubyte.gz', self._root, - sha1_hash='2a80914081dc54586dbdf242f9805a6b8d2a15fc') + data, label = self._train_data, self._train_label else: - data_file = download(url+'t10k-images-idx3-ubyte.gz', self._root, - sha1_hash='c3a25af1f52dad7f726cce8cacb138654b760d48') - label_file = download(url+'t10k-labels-idx1-ubyte.gz', self._root, - sha1_hash='763e7fa3757d93b0cdec073cef058b2004252c17') + data, label = self._test_data, self._test_label + + data_file = download(self._base_url + data[0], self._root, + sha1_hash=data[1]) + label_file = download(self._base_url + label[0], self._root, + sha1_hash=label[1]) with gzip.open(label_file, 'rb') as fin: struct.unpack(">II", fin.read(8)) @@ -102,6 +109,38 @@ def _get_data(self): self._label = label +class FashionMNIST(MNIST): + """A dataset of Zalando's article images consisting of fashion products, + a drop-in replacement of the original MNIST dataset from + `https://github.com/zalandoresearch/fashion-mnist`_. + + Each sample is an image (in 3D NDArray) with shape (28, 28, 1). + + Parameters + ---------- + root : str + Path to temp folder for storing data. + train : bool + Whether to load the training or testing set. + transform : function + A user defined callback that transforms each instance. 
For example:: + + transform=lambda data, label: (data.astype(np.float32)/255, label) + """ + def __init__(self, root='~/.mxnet/datasets/fashion-mnist', train=True, + transform=None): + self._base_url = 'https://apache-mxnet.s3.amazonaws.com/gluon/dataset/fashion-mnist/' + self._train_data = ('train-images-idx3-ubyte.gz', + '0cf37b0d40ed5169c6b3aba31069a9770ac9043d') + self._train_label = ('train-labels-idx1-ubyte.gz', + '236021d52f1e40852b06a4c3008d8de8aef1e40b') + self._test_data = ('t10k-images-idx3-ubyte.gz', + '626ed6a7c06dd17c0eec72fa3be1740f146a2863') + self._test_label = ('t10k-labels-idx1-ubyte.gz', + '17f9ab60e7257a1620f4ad76bbbaf857c3920701') + super(FashionMNIST, self).__init__(root, train, transform) + + class CIFAR10(_DownloadedDataset): """CIFAR10 image classification dataset from `https://www.cs.toronto.edu/~kriz/cifar.html`_. @@ -118,7 +157,7 @@ class CIFAR10(_DownloadedDataset): transform=lambda data, label: (data.astype(np.float32)/255, label) """ - def __init__(self, root='~/.mxnet/datasets/', train=True, + def __init__(self, root='~/.mxnet/datasets/cifar10', train=True, transform=None): self._file_hashes = {'data_batch_1.bin': 'aadd24acce27caa71bf4b10992e9e7b2d74c2540', 'data_batch_2.bin': 'c0ba65cce70568cd57b4e03e9ac8d2a5367c1795', @@ -136,9 +175,6 @@ def _read_batch(self, filename): data[:, 0].astype(np.int32) def _get_data(self): - if not os.path.isdir(self._root): - os.makedirs(self._root) - file_paths = [(name, os.path.join(self._root, 'cifar-10-batches-bin/', name)) for name in self._file_hashes] if any(not os.path.exists(path) or not check_sha1(path, self._file_hashes[name]) diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py index 32298fcd57d5..7f388be73cb3 100644 --- a/tests/python/unittest/test_gluon_data.py +++ b/tests/python/unittest/test_gluon_data.py @@ -71,6 +71,7 @@ def test_sampler(): def test_datasets(): assert len(gluon.data.vision.MNIST(root='data')) == 60000 + assert 
len(gluon.data.vision.FashionMNIST(root='data')) == 60000 assert len(gluon.data.vision.CIFAR10(root='data', train=False)) == 10000 def test_image_folder_dataset(): From 910b422ba77274ccc1c1ac2b27302212f79d6ad6 Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Mon, 28 Aug 2017 12:06:18 -0700 Subject: [PATCH 053/448] add doc for dataset (#7644) --- docs/api/python/gluon.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/api/python/gluon.md b/docs/api/python/gluon.md index ac637749f856..ed42a7d61120 100644 --- a/docs/api/python/gluon.md +++ b/docs/api/python/gluon.md @@ -228,6 +228,7 @@ in Python and then deploy with symbolic graph in C++ and Scala. :nosignatures: MNIST + FashionMNIST CIFAR10 ``` From 860dda2cc4741ac8167a7f81bd9d835364d5954a Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Mon, 28 Aug 2017 12:06:38 -0700 Subject: [PATCH 054/448] Change apache package URL to https (#7622) --- docs/build_version_doc/AddPackageLink.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/build_version_doc/AddPackageLink.py b/docs/build_version_doc/AddPackageLink.py index 8fe04b50b5ce..e3cc19824ba4 100644 --- a/docs/build_version_doc/AddPackageLink.py +++ b/docs/build_version_doc/AddPackageLink.py @@ -30,13 +30,13 @@ args = parser.parse_args() tag = args.current_version - src_url = "http://www.apache.org/dyn/closer.cgi/incubator/" \ + src_url = "https://www.apache.org/dyn/closer.cgi/incubator/" \ "mxnet/%s-incubating/apache-mxnet-src-%s-incubating.tar.gz" % (tag, tag) - pgp_url = "http://www.apache.org/dyn/closer.cgi/incubator/" \ + pgp_url = "https://www.apache.org/dyn/closer.cgi/incubator/" \ "mxnet/%s-incubating/apache-mxnet-src-%s-incubating.tar.gz.asc" % (tag, tag) - sha_url = "http://www.apache.org/dyn/closer.cgi/incubator/" \ + sha_url = "https://www.apache.org/dyn/closer.cgi/incubator/" \ "mxnet/%s-incubating/apache-mxnet-src-%s-incubating.tar.gz.sha" % (tag, tag) - md5_url = "http://www.apache.org/dyn/closer.cgi/incubator/" \ + md5_url = 
"https://www.apache.org/dyn/closer.cgi/incubator/" \ "mxnet/%s-incubating/apache-mxnet-src-%s-incubating.tar.gz.md5" % (tag, tag) download_str = "
" From 4e116740d1adf78e9e0c4ed6202965db08c2087b Mon Sep 17 00:00:00 2001 From: Pracheer Gupta Date: Mon, 28 Aug 2017 17:22:31 -0700 Subject: [PATCH 055/448] Pip installer for CoreML Converter: mxnet-to-coreml (#7624) * Fixing CoreML converter's README: typos/grammar/etc. * CoreML converter README update: Talk about layers first and then about models. * Providing examples on converting various standard models; calling out issues with InceptionV3. * Fixing CoreML converter's README: typos/grammar/etc. * CoreML converter README update: Talk about layers first and then about models. * Providing examples on converting various standard models; calling out issues with InceptionV3. * Pip installer for converter: mxnet-coreml-converter. Runs only on MacOS and python 2.7. Once inside the directory pip_package, user needs to run: python setup.py bdist_wheel twine upload dist/* Once uploaded it'll look like this: https://testpypi.python.org/pypi/mxnet-coreml-converter Also updated the README for converter to reflect this. Note that we are going with a package per tool for the time being. Please leave feedback if you think it is better to adopt the policy of all the tools in one single package. Unit tests continue to pass. * More informative pypi package documentation. * Updating MacOS in release notes to 10.11 after testing on it. * Changing the name to mxnet-to-coreml and version to 0.1.0. * Added license to setup.py * Updating readme files with the correct pip package name. 
--- tools/coreml/README.md | 39 +++++++-------- tools/coreml/{ => converter}/utils.py | 0 tools/coreml/mxnet_coreml_converter.py | 3 +- tools/coreml/pip_package/.gitignore | 10 ++++ tools/coreml/pip_package/MANIFEST.in | 5 ++ tools/coreml/pip_package/README.rst | 44 ++++++++++++++++ tools/coreml/pip_package/setup.py | 69 ++++++++++++++++++++++++++ tools/coreml/test/test_mxnet_image.py | 2 +- 8 files changed, 149 insertions(+), 23 deletions(-) rename tools/coreml/{ => converter}/utils.py (100%) create mode 100644 tools/coreml/pip_package/.gitignore create mode 100644 tools/coreml/pip_package/MANIFEST.in create mode 100644 tools/coreml/pip_package/README.rst create mode 100644 tools/coreml/pip_package/setup.py diff --git a/tools/coreml/README.md b/tools/coreml/README.md index e29eebe84bc1..45f19b608bdb 100644 --- a/tools/coreml/README.md +++ b/tools/coreml/README.md @@ -3,22 +3,23 @@ This tool helps convert MXNet models into [Apple CoreML](https://developer.apple.com/documentation/coreml) format which can then be run on Apple devices. ## Installation -In order to use this tool you need to have these installed: -* MacOS - High Sierra 10.13 -* Xcode 9 -* coremltools 0.5.0 or greater (pip install coremltools) -* mxnet 0.10.0 or greater. [Installation instructions](http://mxnet.io/get_started/install.html). -* yaml (pip install pyyaml) +In order to use this tool you need to have these: +* MacOS - 10.11 (El Capitan) or higher (for running inferences on the converted model MacOS 10.13 or higher (for phones: iOS 11 or above) is needed) * python 2.7 +* mxnet-to-coreml tool: + +```bash +pip install mxnet-to-coreml +``` ## How to use -Let's say you want to use your MXNet model in an iPhone App. For the purpose of this example, let's say you want to use squeezenet-v1.1. +Let's say you want to use your MXNet model in an iPhone App. For the purpose of this example, let's assume it is a squeezenet-v1.1 model. -1. Download the model into the directory where this converter resides. 
Squeezenet can be downloaded from [here](http://data.mxnet.io/models/imagenet/squeezenet/). +1. Download the model into the directory where this converter resides. Squeezenet can be downloaded from [here](http://data.mxnet.io/models/imagenet/squeezenet/). The synset.txt file which contains all the class-labels and can be downloaded from [here](http://data.mxnet.io/models/imagenet/synset.txt). 2. Run this command: ```bash -python mxnet_coreml_converter.py --model-prefix='squeezenet_v1.1' --epoch=0 --input-shape='{"data":"3,227,227"}' --mode=classifier --pre-processing-arguments='{"image_input_names":"data"}' --class-labels classLabels.txt --output-file="squeezenetv11.mlmodel" +mxnet_coreml_converter.py --model-prefix='squeezenet_v1.1' --epoch=0 --input-shape='{"data":"3,227,227"}' --mode=classifier --pre-processing-arguments='{"image_input_names":"data"}' --class-labels synset.txt --output-file="squeezenetv11.mlmodel" ``` The above command will save the converted model in CoreML format to file squeezenet-v11.mlmodel. Internally, the model is first loaded by MXNet recreating the entire symbolic graph in memory. The converter walks through this symbolic graph converting each operator into its CoreML equivalent. Some of the supplied arguments to the converter are used by MXNet to generate the graph while others are used by CoreML either to pre-process the input (before passing it to the neural network) or to process the output of the neural network in a particular way. @@ -40,20 +41,20 @@ python mxnet_coreml_converter.py --model-prefix='squeezenet_v1.1' --epoch=0 --in You could provide a file containing class labels (as above) so that CoreML will return the category a given image belongs to. The file should have a label per line and labels can have any special characters. The line number of the label in the file should correspond with the index of softmax output. E.g. 
```bash -python mxnet_coreml_converter.py --model-prefix='squeezenet_v1.1' --epoch=0 --input-shape='{"data":"3,227,227"}' --mode=classifier --class-labels classLabels.txt --output-file="squeezenetv11.mlmodel" +mxnet_coreml_converter.py --model-prefix='squeezenet_v1.1' --epoch=0 --input-shape='{"data":"3,227,227"}' --mode=classifier --class-labels synset.txt --output-file="squeezenetv11.mlmodel" ``` ### Adding a pre-processing layer to CoreML model. You could ask CoreML to pre-process the images before passing them through the model. The following command provides image re-centering parameters for red, blue and green channel. ```bash -python mxnet_coreml_converter.py --model-prefix='squeezenet_v1.1' --epoch=0 --input-shape='{"data":"3,224,224"}' --pre-processing-arguments='{"red_bias":127,"blue_bias":117,"green_bias":103}' --output-file="squeezenet_v11.mlmodel" +mxnet_coreml_converter.py --model-prefix='squeezenet_v1.1' --epoch=0 --input-shape='{"data":"3,224,224"}' --pre-processing-arguments='{"red_bias":127,"blue_bias":117,"green_bias":103}' --output-file="squeezenet_v11.mlmodel" ``` If you are building an app for a model that takes "Image" as an input, you will have to provide image_input_names as pre-processing arguments. This tells CoreML that a particular input variable is of type Image. E.g.: ```bash -python mxnet_coreml_converter.py --model-prefix='squeezenet_v1.1' --epoch=0 --input-shape='{"data":"3,224,224"}' --pre-processing-arguments='{"red_bias":127,"blue_bias":117,"green_bias":103,"image_input_names":"data"}' --output-file="squeezenet_v11.mlmodel" +mxnet_coreml_converter.py --model-prefix='squeezenet_v1.1' --epoch=0 --input-shape='{"data":"3,224,224"}' --pre-processing-arguments='{"red_bias":127,"blue_bias":117,"green_bias":103,"image_input_names":"data"}' --output-file="squeezenet_v11.mlmodel" ``` ## Currently supported @@ -79,36 +80,32 @@ Any MXNet model that uses the above operators can be converted easily. For insta 1. 
[Inception-BN](http://data.mxnet.io/models/imagenet/inception-bn/) ```bash -python mxnet_coreml_converter.py --model-prefix='Inception-BN' --epoch=126 --input-shape='{"data":"3,224,224"}' --mode=classifier --pre-processing-arguments='{"image_input_names":"data"}' --class-labels classLabels.txt --output-file="InceptionBN.mlmodel" +mxnet_coreml_converter.py --model-prefix='Inception-BN' --epoch=126 --input-shape='{"data":"3,224,224"}' --mode=classifier --pre-processing-arguments='{"image_input_names":"data"}' --class-labels synset.txt --output-file="InceptionBN.mlmodel" ``` 2. [NiN](http://data.dmlc.ml/models/imagenet/nin/) ```bash -python mxnet_coreml_converter.py --model-prefix='nin' --epoch=0 --input-shape='{"data":"3,224,224"}' --mode=classifier --pre-processing-arguments='{"image_input_names":"data"}' --class-labels classLabels.txt --output-file="nin.mlmodel" +mxnet_coreml_converter.py --model-prefix='nin' --epoch=0 --input-shape='{"data":"3,224,224"}' --mode=classifier --pre-processing-arguments='{"image_input_names":"data"}' --class-labels synset.txt --output-file="nin.mlmodel" ``` 3. [Resnet](http://data.mxnet.io/models/imagenet/resnet/) ```bash -python mxnet_coreml_converter.py --model-prefix='resnet-50' --epoch=0 --input-shape='{"data":"3,224,224"}' --mode=classifier --pre-processing-arguments='{"image_input_names":"data"}' --class-labels classLabels.txt --output-file="resnet50.mlmodel" +mxnet_coreml_converter.py --model-prefix='resnet-50' --epoch=0 --input-shape='{"data":"3,224,224"}' --mode=classifier --pre-processing-arguments='{"image_input_names":"data"}' --class-labels synset.txt --output-file="resnet50.mlmodel" ``` 4. 
[Squeezenet](http://data.mxnet.io/models/imagenet/squeezenet/) ```bash -python mxnet_coreml_converter.py --model-prefix='squeezenet_v1.1' --epoch=0 --input-shape='{"data":"3,227,227"}' --mode=classifier --pre-processing-arguments='{"image_input_names":"data"}' --class-labels classLabels.txt --output-file="squeezenetv11.mlmodel" +mxnet_coreml_converter.py --model-prefix='squeezenet_v1.1' --epoch=0 --input-shape='{"data":"3,227,227"}' --mode=classifier --pre-processing-arguments='{"image_input_names":"data"}' --class-labels synset.txt --output-file="squeezenetv11.mlmodel" ``` 5. [Vgg](http://data.mxnet.io/models/imagenet/vgg/) ```bash -python mxnet_coreml_converter.py --model-prefix='vgg16' --epoch=0 --input-shape='{"data":"3,224,224"}' --mode=classifier --pre-processing-arguments='{"image_input_names":"data"}' --class-labels classLabels.txt --output-file="vgg16.mlmodel" +mxnet_coreml_converter.py --model-prefix='vgg16' --epoch=0 --input-shape='{"data":"3,224,224"}' --mode=classifier --pre-processing-arguments='{"image_input_names":"data"}' --class-labels synset.txt --output-file="vgg16.mlmodel" ``` ## Known issues * [Inception-V3](http://data.mxnet.io/models/imagenet/inception-v3.tar.gz) model can be converted into CoreML format but is unable to run on Xcode. - -## This tool has been tested with: -* MacOS - High Sierra 10.13 Beta. -* Xcode 9 beta 5. diff --git a/tools/coreml/utils.py b/tools/coreml/converter/utils.py similarity index 100% rename from tools/coreml/utils.py rename to tools/coreml/converter/utils.py diff --git a/tools/coreml/mxnet_coreml_converter.py b/tools/coreml/mxnet_coreml_converter.py index 502377eca864..ffa5008b3db4 100644 --- a/tools/coreml/mxnet_coreml_converter.py +++ b/tools/coreml/mxnet_coreml_converter.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file # distributed with this work for additional information @@ -18,7 +19,7 @@ from __future__ import print_function import argparse from converter._mxnet_converter import convert -from utils import load_model +from converter.utils import load_model import yaml from ast import literal_eval diff --git a/tools/coreml/pip_package/.gitignore b/tools/coreml/pip_package/.gitignore new file mode 100644 index 000000000000..7c67bf467970 --- /dev/null +++ b/tools/coreml/pip_package/.gitignore @@ -0,0 +1,10 @@ +# Compiled python modules. +*.pyc + +# Setuptools distribution folder. +/dist/ + +# Python egg metadata, regenerated from source files by setuptools. +/*.egg-info +/*.egg + diff --git a/tools/coreml/pip_package/MANIFEST.in b/tools/coreml/pip_package/MANIFEST.in new file mode 100644 index 000000000000..6ecd97d57dc7 --- /dev/null +++ b/tools/coreml/pip_package/MANIFEST.in @@ -0,0 +1,5 @@ +# Include the license file +include LICENSE.txt + +# Documentation for pypi webpage +include README.rst diff --git a/tools/coreml/pip_package/README.rst b/tools/coreml/pip_package/README.rst new file mode 100644 index 000000000000..875d89fcd208 --- /dev/null +++ b/tools/coreml/pip_package/README.rst @@ -0,0 +1,44 @@ +MXNET -> CoreML Converter +========================= + +`Apache MXNet `_ (incubating) is a deep learning framework designed for both efficiency and flexibility. It allows you to mix `symbolic and imperative programming `_ to maximize efficiency and productivity. At its core, MXNet contains a dynamic dependency scheduler that automatically parallelizes both symbolic and imperative operations on the fly. A graph optimization layer on top of that makes symbolic execution fast and memory efficient. MXNet is portable and lightweight, scaling effectively to multiple GPUs and multiple machines. 
+ +`Core ML `_ is an Apple framework which allows developers to simply and easily integrate machine learning (ML) models into apps running on Apple devices (including iOS, watchOS, macOS, and tvOS). Core ML introduces a public file format (.mlmodel) for a broad set of ML methods including deep neural networks (both convolutional and recurrent), tree ensembles with boosting, and generalized linear models. Models in this format can be directly integrated into apps through Xcode. + +This tool helps convert `MXNet models `_ into `Apple CoreML `_ format which can then be run on Apple devices. You can find more information about this tool on our `github `_ page. + +Prerequisites +------------- +This package can only be installed on MacOS X since it relies on Apple's CoreML SDK. It can be run on MacOS 10.11 or higher though for running inferences on the converted model MacOS 10.13 or higher is needed (or for phones, iOS 11 or above). + +Installation +------------ +The method for installing this tool follows the `standard python package installation steps `_. Once you have set up a python environment, run:: + + pip install mxnet-to-coreml + +The package `documentation `_ contains more details on how to use coremltools. 
+ +Dependencies +------------ +This tool has the following dependencies: + +* mxnet (0.10.0+) +* coremltools (0.5.1+) +* pyyaml (3.12+) + +Sample Usage +------------ + +To convert, for example, a `Squeezenet model `_ with labels from `synset.txt `_, run the following:: + + mxnet_coreml_converter.py --model-prefix='squeezenet_v1.1' \ + --epoch=0 --input-shape='{"data":"3,227,227"}' \ + --mode=classifier --pre-processing-arguments='{"image_input_names":"data"}' \ + --class-labels synset.txt --output-file="squeezenetv11.mlmodel" + +More Information +---------------- +* `On Github `_ +* `MXNet framework `_ +* `Apple CoreML `_ diff --git a/tools/coreml/pip_package/setup.py b/tools/coreml/pip_package/setup.py new file mode 100644 index 000000000000..18c601d38166 --- /dev/null +++ b/tools/coreml/pip_package/setup.py @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from setuptools import setup +from setuptools import find_packages + +# We are overriding the default behavior of bdist_wheel which is generating +# pure python wheels while we need platform specific wheel since this tool +# can only work on MacOS. 
+try: + from wheel.bdist_wheel import bdist_wheel as _bdist_wheel + class bdist_wheel(_bdist_wheel): + def finalize_options(self): + _bdist_wheel.finalize_options(self) + self.root_is_pure = False +except ImportError: + bdist_wheel = None + + +def readme(): + """ + Reads README.rst file and allows us to provide + a better experience for pypi webpage. + """ + with open('README.rst') as f: + return f.read() + +setup(name='mxnet-to-coreml', + version='0.1.0', + description='Tool to convert MXNet models into Apple CoreML model format.', + long_description=readme(), + classifiers=[ + 'Development Status :: 4 - Beta', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: MacOS :: MacOS X', + 'Programming Language :: Python :: 2.7', + 'Topic :: Software Development :: Libraries :: Python Modules' + ], + keywords='Apache MXNet Apple CoreML Converter Deep Learning', + url='https://github.com/apache/incubator-mxnet/tree/master/tools/coreml', + author='pracheer', + author_email='pracheer_gupta@hotmail.com', + license='Apache 2.0', + package_dir = {'': '..'}, + packages=['converter'], + install_requires=[ + 'mxnet', + 'coremltools', + 'pyyaml', + ], + scripts=['../mxnet_coreml_converter.py'], + python_requires='~=2.7', + zip_safe=False, + cmdclass={'bdist_wheel': bdist_wheel},) diff --git a/tools/coreml/test/test_mxnet_image.py b/tools/coreml/test/test_mxnet_image.py index ac30ac7f5ad9..2bbf7b1e264b 100644 --- a/tools/coreml/test/test_mxnet_image.py +++ b/tools/coreml/test/test_mxnet_image.py @@ -24,7 +24,7 @@ sys.path.append(current_working_directory + "/..") sys.path.append(current_working_directory + "/../converter/") import _mxnet_converter as mxnet_converter -from utils import load_model +from converter.utils import load_model VAL_DATA = 'data/val-5k-256.rec' From 12b244dae7957715ca4bb77d76448c744b7730c4 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 28 Aug 2017 17:31:45 -0700 Subject: [PATCH 056/448] 
Parallelize windows unit tests of python 2 and 3 in jenkins (#7646) * parallelize python windows tests * reordered for clarity --- Jenkinsfile | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index ac34e71a53f1..fe0151a879d6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -342,7 +342,7 @@ try { } } }, - 'Python2/3: CPU Win':{ + 'Python 2: CPU Win':{ node('mxnetwindows') { ws('workspace/ut-python-cpu') { init_git_win() @@ -351,20 +351,30 @@ try { 7z x -y vc14_cpu.7z''' bat """xcopy C:\\mxnet\\data data /E /I /Y xcopy C:\\mxnet\\model model /E /I /Y - call activate py3 + call activate py2 set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc C:\\mxnet\\test_cpu.bat""" - bat """xcopy C:\\mxnet\\data data /E /I /Y + } + } + }, + 'Python 3: CPU Win': { + node('mxnetwindows') { + ws('workspace/ut-python-cpu') { + init_git_win() + unstash 'vc14_cpu' + bat '''rmdir /s/q pkg_vc14_cpu + 7z x -y vc14_cpu.7z''' + bat """xcopy C:\\mxnet\\data data /E /I /Y xcopy C:\\mxnet\\model model /E /I /Y - call activate py2 + call activate py3 set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc C:\\mxnet\\test_cpu.bat""" } } }, - 'Python2/3: GPU Win':{ + 'Python 2: GPU Win':{ node('mxnetwindows') { ws('workspace/ut-python-gpu') { init_git_win() @@ -373,19 +383,29 @@ try { 7z x -y vc14_gpu.7z''' bat """xcopy C:\\mxnet\\data data /E /I /Y xcopy C:\\mxnet\\model model /E /I /Y - call activate py3 + call activate py2 set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc C:\\mxnet\\test_gpu.bat""" + } + } + }, + 'Python 3: GPU Win':{ + node('mxnetwindows') { + ws('workspace/ut-python-gpu') { + init_git_win() + unstash 'vc14_gpu' + bat '''rmdir /s/q pkg_vc14_gpu + 7z x -y vc14_gpu.7z''' bat """xcopy C:\\mxnet\\data data /E /I /Y xcopy 
C:\\mxnet\\model model /E /I /Y - call activate py2 + call activate py3 set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc C:\\mxnet\\test_gpu.bat""" } } - } + } } stage('Integration Test') { From 03b1d8de0959bcfe91bf3279660f50a0248021b9 Mon Sep 17 00:00:00 2001 From: Hagay Lupesko Date: Mon, 28 Aug 2017 20:08:50 -0700 Subject: [PATCH 057/448] Removed asset loaded insecurely and added the asset to be loaded from the origin securely (#7649) --- docs/_static/mxnet-theme/navbar.html | 45 +-------------------------- docs/_static/mxnet.png | Bin 0 -> 67645 bytes 2 files changed, 1 insertion(+), 44 deletions(-) create mode 100644 docs/_static/mxnet.png diff --git a/docs/_static/mxnet-theme/navbar.html b/docs/_static/mxnet-theme/navbar.html index c88fb58bb5c2..0d49eeb4dc89 100644 --- a/docs/_static/mxnet-theme/navbar.html +++ b/docs/_static/mxnet-theme/navbar.html @@ -1,51 +1,8 @@ - -