diff --git a/Makefile b/Makefile
index e821c6faa5dd..5a3886035ab7 100644
--- a/Makefile
+++ b/Makefile
@@ -40,11 +40,11 @@ endif
 # use customized config file
 include $(config)

-ifeq ($(USE_MKL2017), 1)
-# must run ./prepare_mkl before including mshadow.mk
-	RETURN_STRING := $(shell ./prepare_mkl.sh $(MKLML_ROOT))
-	MKLROOT := $(firstword $(RETURN_STRING))
-	export USE_MKLML = $(lastword $(RETURN_STRING))
+ifeq ($(USE_MKLDNN), 1)
+	RETURN_STRING := $(shell ./prepare_mkldnn.sh $(MKLDNN_ROOT))
+	MKLDNNROOT := $(firstword $(RETURN_STRING))
+	MKLROOT := $(lastword $(RETURN_STRING))
+	export USE_MKLML = 1
 endif

 include mshadow/make/mshadow.mk
@@ -112,23 +112,16 @@ ifeq ($(USE_NNPACK), 1)
 	LDFLAGS += -lnnpack
 endif

-ifeq ($(USE_MKL2017), 1)
-	CFLAGS += -DMXNET_USE_MKL2017=1
+ifeq ($(USE_MKLDNN), 1)
+	CFLAGS += -DMXNET_USE_MKLDNN=1
 	CFLAGS += -DUSE_MKL=1
-	CFLAGS += -I$(ROOTDIR)/src/operator/mkl/
-	CFLAGS += -I$(MKLML_ROOT)/include
-	LDFLAGS += -L$(MKLML_ROOT)/lib
-	ifeq ($(USE_MKL2017_EXPERIMENTAL), 1)
-		CFLAGS += -DMKL_EXPERIMENTAL=1
-	else
-		CFLAGS += -DMKL_EXPERIMENTAL=0
-	endif
-	ifeq ($(UNAME_S), Darwin)
-		LDFLAGS += -lmklml
-	else
-		LDFLAGS += -Wl,--as-needed -lmklml_intel -lmklml_gnu
+	CFLAGS += -I$(ROOTDIR)/src/operator/nn/mkldnn/
+	ifneq ($(MKLDNNROOT), $(MKLROOT))
+		CFLAGS += -I$(MKLROOT)/include
+		LDFLAGS += -L$(MKLROOT)/lib
 	endif
-	LDFLAGS += -liomp5
+	CFLAGS += -I$(MKLDNNROOT)/include
+	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn
 endif

 # verify existence of separate lapack library when using blas/openblas/atlas
@@ -138,7 +131,7 @@ endif
 # - for Ubuntu, installing atlas will not automatically install the atlas provided lapack library
 # silently switching lapack off instead of letting the build fail because of backward compatibility
 ifeq ($(USE_LAPACK), 1)
-ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas))
+ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl))
 ifeq (,$(wildcard /lib/liblapack.a))
 ifeq (,$(wildcard /usr/lib/liblapack.a))
 ifeq (,$(wildcard $(USE_LAPACK_PATH)/liblapack.a))
@@ -154,7 +147,7 @@ ifeq ($(USE_LAPACK), 1)
 	ifneq ($(USE_LAPACK_PATH), )
 		LDFLAGS += -L$(USE_LAPACK_PATH)
 	endif
-	ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas))
+	ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl))
 		LDFLAGS += -llapack
 	endif
 	CFLAGS += -DMXNET_USE_LAPACK
@@ -280,9 +273,9 @@ endif

 all: lib/libmxnet.a lib/libmxnet.so $(BIN) extra-packages

-SRC = $(wildcard src/*/*/*.cc src/*/*.cc src/*.cc)
+SRC = $(wildcard src/*/*/*/*.cc src/*/*/*.cc src/*/*.cc src/*.cc)
 OBJ = $(patsubst %.cc, build/%.o, $(SRC))
-CUSRC = $(wildcard src/*/*/*.cu src/*/*.cu src/*.cu)
+CUSRC = $(wildcard src/*/*/*/*.cu src/*/*/*.cu src/*/*.cu src/*.cu)
 CUOBJ = $(patsubst %.cu, build/%_gpu.o, $(CUSRC))

 # extra operators
@@ -521,7 +514,8 @@ clean: cyclean $(EXTRA_PACKAGES_CLEAN)
 else
 clean: cyclean testclean $(EXTRA_PACKAGES_CLEAN)
 	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ R-package/NAMESPACE R-package/man R-package/R/mxnet_generated.R \
-		R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz
+		R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz \
+		external/mkldnn/install/*
 	cd $(DMLC_CORE); $(MAKE) clean; cd -
 	cd $(PS_PATH); $(MAKE) clean; cd -
 	cd $(NNVM_PATH); $(MAKE) clean; cd -
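With these Makefile changes, a build against the new backend is presumably driven by make USE_MKLDNN=1 (optionally with USE_BLAS=mkl, which the LAPACK checks now accept); prepare_mkldnn.sh, added further down in this patch, fetches and builds MKL-DNN under external/mkldnn/install and echoes MKLDNNROOT and MKLROOT back to the Makefile.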
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index 84ee9fa5e4d4..e37896e26695 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -34,12 +34,12 @@
 #include
 #include
 #include
+#if MXNET_USE_MKLDNN == 1
+#include <mkldnn.hpp>
+#endif
 #include "./base.h"
 #include "./storage.h"
 #include "./engine.h"
-#if MKL_EXPERIMENTAL == 1
-#include <mkl_memory.h>
-#endif
 // check c++11
 #if DMLC_USE_CXX11 == 0
 #error "cxx11 was required for ndarray module"
 #endif
@@ -60,8 +60,12 @@ enum NDArrayStorageType {
   kDefaultStorage,         // dense
   kRowSparseStorage,       // row sparse
   kCSRStorage,             // csr
+#if MXNET_USE_MKLDNN == 1
+  kMKLDNNStorage,          // MKLDNN
+#endif
 };

+class MKLDNNMemory;

 /*!
  * \brief ndarray interface
  */
@@ -70,9 +74,6 @@ class NDArray {
  public:
   /*! \brief default constructor */
   NDArray() {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = MKLMemHolder::create();
-#endif
   }
   /*!
    * \brief constructs a new dynamic NDArray
@@ -86,56 +87,14 @@ class NDArray {
       : ptr_(std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype)),
         shape_(shape), dtype_(dtype), storage_type_(kDefaultStorage),
         entry_({nullptr, 0, 0}) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = std::make_shared<MKLMemHolder>();
-#endif
   }
   /*! \brief constructor for NDArray with storage type */
   NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx,
           bool delay_alloc = true, int dtype = mshadow::default_type_flag,
           std::vector<int> aux_types = {}, std::vector<TShape> aux_shapes = {},
-          TShape storage_shape = TShape(mshadow::Shape1(0)))
-      : shape_(shape), dtype_(dtype), storage_type_(stype),
-        entry_({nullptr, 0, 0}) {
-    // Assign default aux types if not given
-    if (aux_types.size() == 0) {
-      if (stype == kRowSparseStorage) {
-        aux_types = {mshadow::kInt64};
-      } else if (stype == kCSRStorage) {
-        aux_types = {mshadow::kInt64, mshadow::kInt64};
-      } else {
-        LOG(FATAL) << "Unknown storage type " << stype;
-      }
-    }
-    // Assign default shapes if not given
-    // unknown shapes are intialized as {0} such that Size() would return 0
-    if (aux_shapes.size() == 0) {
-      if (stype == kRowSparseStorage) {
-        aux_shapes = {TShape(mshadow::Shape1(0))};
-      } else if (stype == kCSRStorage) {
-        // aux shapes for indptr and indices
-        aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))};
-      } else {
-        LOG(FATAL) << "Unknown storage type " << stype;
-      }
-    }
-    if (storage_shape.Size() == 0) {
-      if (stype == kRowSparseStorage) {
-        storage_shape = shape;
-        storage_shape[0] = aux_shapes[rowsparse::kIdx][0];
-      } else if (stype == kCSRStorage) {
-        storage_shape = aux_shapes[csr::kIdx];
-      } else {
-        LOG(FATAL) << "Unknown storage type " << stype;
-      }
-    }
-    ptr_ = std::make_shared<Chunk>(stype, storage_shape, ctx, delay_alloc,
-                                   dtype, aux_types, aux_shapes);
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = std::make_shared<MKLMemHolder>();
-#endif
-  }
+          TShape storage_shape = TShape(mshadow::Shape1(0)));
+
   /*!
    * \brief constructing a static NDArray that shares data with TBlob
    * Use with caution: allocate ONLY ONE NDArray for each TBlob,
   * make sure the memory region is available through out the life of NDArray
   * \param data the memory content of static data
   * \param dev_id the device id this tensor sits at
   */
@@ -147,9 +106,6 @@ class NDArray {
       : ptr_(std::make_shared<Chunk>(data, dev_id)), shape_(data.shape_),
         dtype_(data.type_flag_), storage_type_(kDefaultStorage),
         entry_({nullptr, 0, 0}) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = std::make_shared<MKLMemHolder>();
-#endif
   }

   /*!
@@ -166,9 +122,6 @@ class NDArray {
           const TBlob &data, const std::vector<TBlob> &aux_data, int dev_id)
       : ptr_(std::make_shared<Chunk>(stype, data, aux_data, dev_id)), shape_(shape),
         dtype_(data.type_flag_), storage_type_(stype), entry_({nullptr, 0, 0}) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = std::make_shared<MKLMemHolder>();
-#endif
   }

@@ -253,9 +206,6 @@ class NDArray {
         << "Unexpected storage type: " << stype;
       res = TBlob(dptr, shape, ptr_->aux_handles[i].ctx.dev_mask(), type);
     });
-#if MKL_EXPERIMENTAL == 1
-    res.Mkl_mem_ = Mkl_mem_;
-#endif
     return res;
   }
   /*!
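Since kMKLDNNStorage exists only when MXNET_USE_MKLDNN is defined, any switch over NDArrayStorageType has to guard its MKLDNN case the same way; stype_string() in src/common/utils.cc below does exactly this. A minimal sketch of the pattern (hypothetical helper, not part of this patch):

#include <mxnet/ndarray.h>

// Returns true only when the array is backed by the new MKLDNN storage.
bool IsMKLDNNStorage(const mxnet::NDArray &arr) {
#if MXNET_USE_MKLDNN == 1
  return arr.storage_type() == mxnet::kMKLDNNStorage;
#else
  return false;  // the enum value does not exist in this build
#endif
}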
@@ -497,12 +447,6 @@ class NDArray { CHECK_GE(ptr_->shandle.size, shape.Size() * mshadow::mshadow_sizeof(dtype)) << "NDArray.AsArray: target memory size is bigger"; -#if MKL_EXPERIMENTAL == 1 - if (Mkl_mem_ != nullptr) { - // convert prv to cpu - Mkl_mem_->check_and_prv_to_cpu(ptr_->shandle.dptr); - } -#endif NDArray ret = *this; ret.shape_ = shape; ret.dtype_ = dtype; @@ -574,6 +518,31 @@ class NDArray { << "CheckAndAllocAuxData is not intended for kDefaultStorage"; ptr_->CheckAndAllocAuxData(i, aux_shape); } + +#if MXNET_USE_MKLDNN == 1 + /* + * This function returns mkldnn::memory with the default primitive_desc. + */ + std::shared_ptr GetMKLDNNData() const; + /* + * This function returns mkldnn::memory with the given primitive_desc + * as long as the array size meets the required size in the given primitive_desc. + */ + std::shared_ptr GetMKLDNNData( + const mkldnn::memory::primitive_desc &desc) const; + /* + * This function returns mkldnn::memory with the given primitive_desc. + * The returned mkldnn::memory will have the same physical layout as + * the given primitive_desc. + */ + std::shared_ptr GetMKLDNNDataReorder( + const mkldnn::memory::primitive_desc &desc) const; + + void CopyFrom(const mkldnn::memory &mem); + std::shared_ptr CreateMKLDNNData( + const mkldnn::memory::primitive_desc &desc); +#endif + /*! * \brief Save list of ndarray into the Stream.x * \param fo The stream of output. @@ -608,6 +577,12 @@ class NDArray { for csr, aux_handles[0] = indptr, aux_handles[1] = indices */ std::vector aux_handles; + +#if MXNET_USE_MKLDNN == 1 + /*! This is created when data is stored in MKLDNN format. + */ + std::shared_ptr Mkl_mem_; +#endif /*! \brief variable from engine */ Engine::VarHandle var; /*! @@ -774,20 +749,14 @@ class NDArray { // storage shape is also updated // if data is already allocated, try reuse the storage. Otherwise, free the current one // and allocate new storage - inline void CheckAndAllocData(const TShape &shape, int dtype) { - CHECK_NE(aux_shapes.size(), 0) << "data is expected to be allocated after aux_data"; - auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); - if (shandle.size < dbytes) { - // free storage if necessary and alloc again - if (shandle.size > 0) Storage::Get()->Free(shandle); - // init storage - shandle = Storage::Get()->Alloc(dbytes, ctx); - } - // init shape - storage_shape = shape; - // delay_alloc is only set when data storage handle is present - delay_alloc = false; - } + void CheckAndAllocData(const TShape &shape, int dtype); + +#if MXNET_USE_MKLDNN == 1 + // Have MKL memory reference to the data in the default storage + // or create memory for MKLDNN. 
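+  // (Judging from the definition in src/ndarray/ndarray.cc below: for
+  // kDefaultStorage this wraps shandle.dptr in an mkldnn::memory without
+  // copying, while for kMKLDNNStorage it allocates MKLDNN-owned memory.)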
+ void SetMKLMem(const TShape &shape, int dtype); +#endif + // create storage handle for aux data based on shape // this function assumes ctx, aux shapes and aux types are set // aux shape is also updated @@ -828,30 +797,8 @@ class NDArray { } }; // struct Chunk - void SetTBlob() const { - CHECK(ptr_ != nullptr); - TShape shape = shape_; - char *dptr = static_cast(ptr_->shandle.dptr); - auto stype = storage_type(); - if (stype == kDefaultStorage) { - dptr += byte_offset_; - } else if (stype == kCSRStorage || stype == kRowSparseStorage) { - shape = storage_shape(); - } else { - LOG(FATAL) << "unknown storage type " << stype; - } - tblob_.dptr_ = dptr; - tblob_.shape_ = shape; - tblob_.type_flag_ = dtype_; - tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id); -#if MKL_EXPERIMENTAL == 1 - tblob_.Mkl_mem_ = Mkl_mem_; -#endif - } + void SetTBlob() const; -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr Mkl_mem_; -#endif /*! \brief internal data of NDArray */ std::shared_ptr ptr_{nullptr}; /*! \brief shape of current NDArray */ diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h index 18bf4fa780d9..876d8acc2dc3 100755 --- a/include/mxnet/tensor_blob.h +++ b/include/mxnet/tensor_blob.h @@ -35,9 +35,6 @@ #include #include #include "./base.h" -#if MXNET_USE_MKL2017 == 1 -#include -#endif namespace mxnet { /* Forward declaration for friend declaration in TBlob */ @@ -65,17 +62,10 @@ class TBlob { /*! \brief type flag of the tensor blob */ int type_flag_; - /*! \brief storing mkl chunk buffer blob, use for experimental only */ -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr Mkl_mem_; -#endif /*! \brief default constructor, default copy assign will work */ TBlob(void) : dptr_(NULL), type_flag_(mshadow::DataType::kFlag) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = NULL; -#endif SetDLTensor(cpu::kDevMask, 0); } /*! @@ -89,9 +79,6 @@ class TBlob { TBlob(DType *dptr, const TShape &shape, int dev_mask, int dev_id = -1) : dptr_(dptr), shape_(shape), type_flag_(mshadow::DataType::kFlag) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = NULL; -#endif SetDLTensor(dev_mask, dev_id); } /*! @@ -104,9 +91,6 @@ class TBlob { */ TBlob(void *dptr, const TShape &shape, int dev_mask, int type_flag, int dev_id = -1) : dptr_(dptr), shape_(shape), type_flag_(type_flag) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = NULL; -#endif SetDLTensor(dev_mask, dev_id); } /*! @@ -134,9 +118,6 @@ class TBlob { shape_ = src.shape_; type_flag_ = mshadow::DataType::kFlag; SetDLTensor(Device::kDevMask, -1); -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = NULL; -#endif return *this; } /*! @@ -171,11 +152,6 @@ class TBlob { CHECK(mshadow::DataType::kFlag == type_flag_) << "TBlob.get_with_shape: data type do not match specified type." << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType::kFlag; -#if MKL_EXPERIMENTAL == 1 - if (Mkl_mem_ != nullptr) { - Mkl_mem_->check_and_prv_to_cpu(dptr_); - } -#endif return mshadow::Tensor(static_cast(dptr_), shape_.FlatTo2D(), shape_[shape_.ndim() - 1], @@ -216,11 +192,6 @@ class TBlob { CHECK(mshadow::DataType::kFlag == type_flag_) << "TBlob.get_with_shape: data type do not match specified type." << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType::kFlag; -#if MKL_EXPERIMENTAL == 1 - if (Mkl_mem_ != nullptr) { - Mkl_mem_->check_and_prv_to_cpu(dptr_); - } -#endif return static_cast(dptr_); } /*! 
\brief device mask of the corresponding device */ diff --git a/prepare_mkldnn.sh b/prepare_mkldnn.sh new file mode 100755 index 000000000000..7a4fe4ce5207 --- /dev/null +++ b/prepare_mkldnn.sh @@ -0,0 +1,121 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# set -ex +# +# All modification made by Intel Corporation: © 2016 Intel Corporation +# +# All contributions by the University of California: +# Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +# All rights reserved. +# +# All other contributions: +# Copyright (c) 2014, 2015, the respective contributors +# All rights reserved. +# For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md +# +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of Intel Corporation nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +MXNET_ROOTDIR="$(pwd)" +MKLDNN_ROOTDIR="$MXNET_ROOTDIR/external/mkldnn" +MKLDNN_GITHUB="https://github.com/01org/mkl-dnn.git" +MKLDNN_TMPDIR="$MKLDNN_ROOTDIR/tmp" +MKLDNN_SRCDIR="$MKLDNN_ROOTDIR/src" +MKLDNN_BUILDDIR="$MKLDNN_ROOTDIR/build" +MKLDNN_INSTALLDIR="$MKLDNN_ROOTDIR/install" + +# MKL DNN release tag, or commit. +MKLDNN_COMMIT="v0.11" + +# MKLDNN install destination +HOME_MKLDNN=$1 +if [ ! -z "$HOME_MKLDNN" ]; then + mkdir -p $HOME_MKLDNN + if [ ! 
-w $HOME_MKLDNN ]; then + echo "MKLDNN install to $HOME_MKLDNN failed, please try with sudo" >&2 + exit 1 + fi +fi + +if [ -z $MKLDNNROOT ]; then +if [ ! -f "$MKLDNN_INSTALLDIR/lib/libmkldnn.so" ]; then + mkdir -p $MKLDNN_INSTALLDIR + if [ ! -d $MKLDNN_SRCDIR/.git ]; then + echo "Downloading MKLDNN ..." >&2 + rm -rf $MKLDNN_SRCDIR + git clone --quiet --no-checkout $MKLDNN_GITHUB $MKLDNN_TMPDIR + rsync -a $MKLDNN_TMPDIR/ $MKLDNN_SRCDIR && rm -rf $MKLDNN_TMPDIR + fi + cd $MKLDNN_SRCDIR && git fetch --all && git reset --hard $MKLDNN_COMMIT + if [ -z $MKLROOT ] && [ ! -f $MKLDNN_INSTALLDIR/include/mkl_cblas.h ]; then + rm -rf external && cd scripts && ./prepare_mkl.sh && cd .. + cp -a external/*/* $MKLDNN_INSTALLDIR/. + fi + echo "Building MKLDNN ..." >&2 + cd $MXNET_ROOTDIR + cmake $MKLDNN_SRCDIR -DCMAKE_INSTALL_PREFIX=$MKLDNN_INSTALLDIR -B$MKLDNN_BUILDDIR + make -C $MKLDNN_BUILDDIR -j$(cat /proc/cpuinfo | grep processor | wc -l) + make -C $MKLDNN_BUILDDIR install + rm -rf $MKLDNN_BUILDDIR +fi +MKLDNNROOT=$MKLDNN_INSTALLDIR +fi + +if [ -z $MKLROOT ] && [ -f $MKLDNNROOT/include/mkl_cblas.h ]; then + MKLROOT=$MKLDNNROOT; +fi + +# user specified MKLDNN install folder +if [ -d "$HOME_MKLDNN" ]; then + # skip if user specificed MKLDNNROOT + [ "$MKLDNNROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLDNNROOT/include $MKLDNNROOT/lib $HOME_MKLDNN/. + [ "$MKLROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLROOT/include $MKLROOT/lib $HOME_MKLDNN/. + # update ldconfig if possible + if [ -w /etc/ld.so.conf.d ]; then + echo "$HOME_MKLDNN/lib" > /etc/ld.so.conf.d/mxnmkldnn.conf && ldconfig + fi +# return value to calling script (Makefile,cmake) + echo $HOME_MKLDNN $HOME_MKLDNN +else + echo $MKLDNNROOT $MKLROOT +fi + diff --git a/python/mxnet/ndarray/mkldnn.py b/python/mxnet/ndarray/mkldnn.py new file mode 100644 index 000000000000..e90fd77a34db --- /dev/null +++ b/python/mxnet/ndarray/mkldnn.py @@ -0,0 +1,113 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=wildcard-import, unused-wildcard-import, too-many-lines + +"""MKLDNN NDArray API of MXNet.""" + +from __future__ import absolute_import +from __future__ import division +try: + from __builtin__ import slice as py_slice + from __builtin__ import sum as py_sum +except ImportError: + from builtins import slice as py_slice + from builtins import sum as py_sum + +import ctypes +import warnings + +__all__ = ["_ndarray_cls", "MKLNDArray"] + +import numpy as np +from ..base import _LIB, numeric_types +from ..base import c_array, mx_real_t, integer_types +from ..base import mx_uint, NDArrayHandle, check_call +from ..context import Context +from . import _internal +from . 
import op +from ._internal import _set_ndarray_class +from .ndarray import NDArray, _storage_type, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP +from .ndarray import _STORAGE_TYPE_STR_TO_ID, _STORAGE_TYPE_MKLDNN +from .ndarray import _STORAGE_TYPE_UNDEFINED, _STORAGE_TYPE_DEFAULT +from .ndarray import zeros as _zeros_ndarray +from .ndarray import array as _array + +class MKLNDArray(NDArray): + """The base class of an NDArray stored in a MKLDNN storage format. + """ + + def __repr__(self): + """Returns a string representation of the sparse array.""" + shape_info = 'x'.join(['%d' % x for x in self.shape]) + # The data content is not displayed since the array usually has big shape + return '\n<%s %s @%s>' % (self.__class__.__name__, + shape_info, self.context) + + # TODO + def _at(self, idx): + raise NotSupportedForMKLNDArray(self._at, '[idx]', idx) + + def _slice(self, start, stop): + return op.slice(self, begin=start, end=stop) + + # TODO + def astype(self, dtype): + """Returns a copy of the array after casting to a specified type. + Parameters + ---------- + dtype : numpy.dtype or str + The type of the returned array. + Examples + -------- + >>> x = mx.nd.sparse.zeros('row_sparse', (2,3), dtype='float32') + >>> y = x.astype('int32') + >>> y.dtype + + """ + res = zeros(shape=self.shape, ctx=self.context, + dtype=dtype, stype=self.stype) + self.copyto(res) + return res + + # TODO + def copyto(self, other): + """Copies the value of this array to another array. + + Parameters + ---------- + other : NDArray or CSRNDArray or RowSparseNDArray or Context + The destination array or context. + + Returns + ------- + NDArray or CSRNDArray or RowSparseNDArray + The copied array. + """ + if isinstance(other, NDArray): + if other.handle is self.handle: + warnings.warn('You are attempting to copy an array to itself', RuntimeWarning) + return + return _internal._copyto(self, out=other) + elif isinstance(other, Context): + hret = _ndarray_cls(_new_alloc_handle(self.stype, self.shape, other, + True, self.dtype, self._aux_types)) + return _internal._copyto(self, out=hret) + else: + raise TypeError('copyto does not support type ' + str(type(other))) + diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 1cd9f40e520d..aa397a98165f 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -51,6 +51,7 @@ _STORAGE_TYPE_DEFAULT = 0 _STORAGE_TYPE_ROW_SPARSE = 1 _STORAGE_TYPE_CSR = 2 +_STORAGE_TYPE_MKLDNN = 3 # pylint: disable= no-member _DTYPE_NP_TO_MX = { diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py index a1a3ba83b4ba..070db90b5832 100644 --- a/python/mxnet/ndarray/sparse.py +++ b/python/mxnet/ndarray/sparse.py @@ -48,7 +48,8 @@ pass from ._internal import _set_ndarray_class from .ndarray import NDArray, _storage_type, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP -from .ndarray import _STORAGE_TYPE_STR_TO_ID, _STORAGE_TYPE_ROW_SPARSE, _STORAGE_TYPE_CSR +from .mkldnn import MKLNDArray +from .ndarray import _STORAGE_TYPE_STR_TO_ID, _STORAGE_TYPE_ROW_SPARSE, _STORAGE_TYPE_CSR, _STORAGE_TYPE_MKLDNN from .ndarray import _STORAGE_TYPE_UNDEFINED, _STORAGE_TYPE_DEFAULT from .ndarray import zeros as _zeros_ndarray from .ndarray import array as _array @@ -1038,6 +1039,8 @@ def _ndarray_cls(handle, writable=True, stype=_STORAGE_TYPE_UNDEFINED): stype = _storage_type(handle) if stype == _STORAGE_TYPE_DEFAULT: return NDArray(handle, writable=writable) + elif stype == _STORAGE_TYPE_MKLDNN: + return MKLNDArray(handle, writable=False) elif stype == 
_STORAGE_TYPE_CSR: return CSRNDArray(handle, writable=writable) elif stype == _STORAGE_TYPE_ROW_SPARSE: diff --git a/src/common/utils.cc b/src/common/utils.cc index 125e4e5dc7d7..b3c34ea63e42 100644 --- a/src/common/utils.cc +++ b/src/common/utils.cc @@ -35,5 +35,21 @@ void CastStorageDispatch(const OpContext& ctx, mxnet::op::CastStorageComputeImpl(ctx, input, output); } +std::string stype_string(const int x) { + switch (x) { + case kDefaultStorage: + return "default"; + case kCSRStorage: + return "csr"; + case kRowSparseStorage: + return "row_sparse"; +#if MXNET_USE_MKLDNN == 1 + case kMKLDNNStorage: + return "mkldnn"; +#endif + } + return "unknown"; +} + } // namespace common } // namespace mxnet diff --git a/src/common/utils.h b/src/common/utils.h index e0604de88ac3..0b1f9610a6f5 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -158,17 +158,7 @@ inline std::string dispatch_mode_string(const DispatchMode x) { /*! \brief get string representation of storage_type */ -inline std::string stype_string(const int x) { - switch (x) { - case kDefaultStorage: - return "default"; - case kCSRStorage: - return "csr"; - case kRowSparseStorage: - return "row_sparse"; - } - return "unknown"; -} +std::string stype_string(const int x); // heuristic to dermine number of threads per GPU inline int GetNumThreadPerGPU() { diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index f595b446848e..67c9693f631e 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -29,11 +29,7 @@ #include "../common/utils.h" #include "../common/exec_utils.h" #include "./exec_pass.h" -#if MXNET_USE_MKL2017 == 1 -#include -#include "../operator/mkl/mkl_memory-inl.h" -#include "../operator/mkl/mkl_util-inl.h" -#endif + namespace mxnet { namespace op { @@ -105,10 +101,6 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor { PreFCompute(is_gpu); fcompute_(state_, op_ctx, in_data_, req, out_data_); PostFCompute(is_gpu); -#if MKL_EXPERIMENTAL == 1 - mkl_tblobs_prv_to_cpu(in_data_); - mkl_tblobs_prv_to_cpu(out_data_); -#endif } ExecType exec_type() const override { @@ -174,10 +166,6 @@ class FComputeExecutor : public StorageFallbackOpExecutor { PreFCompute(is_gpu); fcompute_(attrs_, op_ctx, in_data_, req, out_data_); PostFCompute(is_gpu); -#if MKL_EXPERIMENTAL == 1 - mkl_tblobs_prv_to_cpu(in_data_); - mkl_tblobs_prv_to_cpu(out_data_); -#endif } ExecType exec_type() const override { diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 5e62be8c4c40..65034b86dc5a 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -31,11 +31,6 @@ #include "mxnet/engine.h" #include "ps/ps.h" #include "./kvstore_dist_server.h" -#if MKL_EXPERIMENTAL == 1 -#include -#include "../operator/mkl/mkl_memory-inl.h" -#include "../operator/mkl/mkl_util-inl.h" -#endif namespace mxnet { namespace kvstore { @@ -182,9 +177,6 @@ class KVStoreDist : public KVStoreLocal { // convert to ps keys size_t size = recv_buf.shape().Size(); PSKV& pskv = EncodeKey(key, size); -#if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(recv_buf.data()); -#endif real_t* data = recv_buf.data().dptr(); // false means not to delete data when SArray is deleted auto vals = new ps::SArray(data, size, false); @@ -292,9 +284,6 @@ class KVStoreDist : public KVStoreLocal { size_t size = send_buf.shape().Size(); PSKV& pskv = EncodeKey(key, size); -#if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(send_buf.data()); -#endif real_t* data = 
send_buf.data().dptr<real_t>();
       // do push. false means no delete
       ps::SArray<real_t> vals(data, size, false);
@@ -326,9 +315,6 @@ class KVStoreDist : public KVStoreLocal {
       // allocate memory for the buffer
       size_t num_rows = indices.shape().Size();
       recv_buf.CheckAndAlloc({mshadow::Shape1(num_rows)});
-#if MKL_EXPERIMENTAL == 1
-      mkl_set_tblob_eager_mode(recv_buf.data());
-#endif
       real_t* data = recv_buf.data().dptr<real_t>();
       const auto offsets = indices.data().dptr<int64_t>();
       const auto unit_len = recv_buf.shape().ProdShape(1, recv_buf.shape().ndim());
@@ -364,9 +350,6 @@ class KVStoreDist : public KVStoreLocal {
     using namespace rowsparse;
     auto push_to_servers = [this, key, send_buf]
         (RunContext rctx, Engine::CallbackOnComplete cb) {
-#if MKL_EXPERIMENTAL == 1
-      mkl_set_tblob_eager_mode(send_buf.data());
-#endif
       real_t* data = send_buf.data().dptr<real_t>();
       const int64_t num_rows = send_buf.aux_shape(kIdx)[0];
       const auto offsets = send_buf.aux_data(kIdx).dptr<int64_t>();
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index dd43338f6dfd..d16862966138 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -34,6 +34,7 @@
 #include "../common/utils.h"
 #include "../operator/tensor/matrix_op-inl.h"
 #include "../operator/tensor/init_op.h"
+#include "../operator/nn/mkldnn/mkldnn_base-inl.h"
 #if MXNET_USE_OPENCV
 #include <opencv2/opencv.hpp>
@@ -45,6 +46,80 @@ DMLC_REGISTRY_ENABLE(::mxnet::NDArrayFunctionReg);

 namespace mxnet {

+NDArray::NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx,
+                 bool delay_alloc, int dtype, std::vector<int> aux_types,
+                 std::vector<TShape> aux_shapes, TShape storage_shape) : shape_(shape),
+    dtype_(dtype), storage_type_(stype), entry_({nullptr, 0, 0}) {
+  // Assign default aux types if not given
+  if (aux_types.size() == 0
+#if MXNET_USE_MKLDNN == 1
+      && stype != kMKLDNNStorage
+#endif
+      && stype != kDefaultStorage) {
+    if (stype == kRowSparseStorage) {
+      aux_types = {mshadow::kInt64};
+    } else if (stype == kCSRStorage) {
+      aux_types = {mshadow::kInt64, mshadow::kInt64};
+    } else {
+      LOG(FATAL) << "Unknown storage type " << stype;
+    }
+  }
+  // Assign default shapes if not given
+  // unknown shapes are initialized as {0} such that Size() would return 0
+  if (aux_shapes.size() == 0
+#if MXNET_USE_MKLDNN == 1
+      && stype != kMKLDNNStorage
+#endif
+      && stype != kDefaultStorage) {
+    if (stype == kRowSparseStorage) {
+      aux_shapes = {TShape(mshadow::Shape1(0))};
+    } else if (stype == kCSRStorage) {
+      // aux shapes for indptr and indices
+      aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))};
+    } else {
+      LOG(FATAL) << "Unknown storage type " << stype;
+    }
+  }
+  if (storage_shape.Size() == 0
+#if MXNET_USE_MKLDNN == 1
+      && stype != kMKLDNNStorage
+#endif
+      && stype != kDefaultStorage) {
+    if (stype == kRowSparseStorage) {
+      storage_shape = shape;
+      storage_shape[0] = aux_shapes[rowsparse::kIdx][0];
+    } else if (stype == kCSRStorage) {
+      storage_shape = aux_shapes[csr::kIdx];
+    } else {
+      LOG(FATAL) << "Unknown storage type " << stype;
+    }
+  }
+  ptr_ = std::make_shared<Chunk>(stype, storage_shape, ctx, delay_alloc,
+                                 dtype, aux_types, aux_shapes);
+}
+
+void NDArray::Chunk::CheckAndAllocData(const TShape &shape, int dtype) {
+#if MXNET_USE_MKLDNN == 1
+  if (storage_type == kMKLDNNStorage) {
+    SetMKLMem(shape, dtype);
+    return;
+  }
+#endif
+  CHECK_NE(aux_shapes.size(), 0)
+      << "data is expected to be allocated after aux_data";
+  auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype);
+  if (shandle.size < dbytes) {
+    // free storage if necessary and alloc again
+    if (shandle.size > 0)
+      Storage::Get()->Free(shandle);
+    // init storage
+    shandle = Storage::Get()->Alloc(dbytes, ctx);
+  }
+  // init shape
+  storage_shape = shape;
+  // delay_alloc is only set when data storage handle is present
+  delay_alloc = false;
+}
+
 NDArray NDArray::grad() const {
   if (Imperative::AGInfo::IsNone(*this)) return NDArray();
   Imperative::AGInfo& info = Imperative::AGInfo::Get(entry_.node);
@@ -63,17 +138,118 @@ nnvm::Symbol NDArray::get_autograd_symbol() const {
   return ret;
 }

+#if MXNET_USE_MKLDNN == 1
+
+static inline mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc) {
+  if (desc.data.ndims == 1)
+    return desc.data.format;
+  else if (desc.data.ndims == 2) {
+    if (desc.data.format == mkldnn_io)
+      return mkldnn_oi;
+    else
+      return desc.data.format;
+  }
+  else if (desc.data.ndims == 4) {
+    switch (desc.data.format) {
+      case mkldnn_nchw:
+      case mkldnn_nhwc:
+      case mkldnn_chwn:
+      case mkldnn_nChw8c:
+      case mkldnn_nChw16c:
+        return mkldnn_nchw;
+      case mkldnn_oihw:
+      case mkldnn_ihwo:
+      case mkldnn_hwio:
+      case mkldnn_OIhw8i8o:
+      case mkldnn_OIhw16i16o:
+      case mkldnn_OIhw8i16o2i:
+      case mkldnn_OIhw8o16i2o:
+      case mkldnn_OIhw8o8i:
+      case mkldnn_OIhw16o16i:
+      case mkldnn_IOhw16o16i:
+      case mkldnn_Oihw8o:
+      case mkldnn_Oihw16o:
+      case mkldnn_Ohwi8o:
+      case mkldnn_Ohwi16o:
+      case mkldnn_OhIw16o4i:
+        return mkldnn_oihw;
+      default:
+        LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format;
+        return mkldnn_format_undef;
+    }
+  }
+  else if (desc.data.ndims == 5) {
+    switch (desc.data.format) {
+      case mkldnn_goihw:
+      case mkldnn_gOIhw8i8o:
+      case mkldnn_gOIhw16i16o:
+      case mkldnn_gOIhw8i16o2i:
+      case mkldnn_gOIhw8o16i2o:
+      case mkldnn_gOIhw8o8i:
+      case mkldnn_gOIhw16o16i:
+      case mkldnn_gIOhw16o16i:
+      case mkldnn_gOihw8o:
+      case mkldnn_gOihw16o:
+      case mkldnn_gOhwi8o:
+      case mkldnn_gOhwi16o:
+      case mkldnn_gOhIw16o4i:
+        return mkldnn_goihw;
+      default:
+        LOG(FATAL) << "Unknown MKLDNN format for 5 dimensions: " << desc.data.format;
+        return mkldnn_format_undef;
+    }
+  }
+  else {
+    LOG(FATAL) << "Unsupported dimensions: " << desc.data.ndims;
+    return mkldnn_format_undef;
+  }
+}
+
+static inline mkldnn_mem_ptr Reorder2Default(mkldnn_mem_ptr mem) {
+  auto format = GetDefaultFormat(mem->get_primitive_desc().desc());
+  if (format == mem->get_primitive_desc().desc().data.format)
+    return mem;
+
+  auto pd = mem->get_primitive_desc();
+  mkldnn::memory::dims dims(pd.desc().data.ndims);
+  for (size_t i = 0; i < dims.size(); i++)
+    dims[i] = pd.desc().data.dims[i];
+  mkldnn::memory::format cpp_format = static_cast<mkldnn::memory::format>(format);
+  mkldnn::memory::data_type cpp_type = static_cast<mkldnn::memory::data_type>(
+      pd.desc().data.data_type);
+  mkldnn::memory::desc data_md(dims, cpp_type, cpp_format);
+  mkldnn_mem_ptr def_mem(new mkldnn::memory(mkldnn::memory::primitive_desc(data_md,
+          pd.get_engine())));
+
+  MKLDNNStream &stream = MKLDNNStream::Instance();
+  stream.RegisterMem(def_mem);
+  stream.RegisterPrim(mkldnn::reorder(*mem, *def_mem));
+  // TODO do I have to submit it here?
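+  // (Submitting here runs the reorder eagerly, so def_mem already holds
+  // default-layout data when this function returns; callers such as
+  // Reshape and SetTBlob below rely on that.)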
+  stream.Submit();
+  return def_mem;
+}
+
+#endif
+
 NDArray NDArray::Reshape(const TShape &shape) const {
   CHECK(!is_none()) << "NDArray is not initialized";
-  CHECK(storage_type() == kDefaultStorage) << "Reshape for storage type " <<
-      storage_type() << " is not implemented yet";
-  CHECK(storage_type() == kDefaultStorage) << "Reshape for storage type " <<
-      storage_type() << " is not implemented yet";
   CHECK_GE(shape_.Size(), shape.Size())
       << "NDArray.Reshape: target shape size is larger current shape";
-  NDArray ret = this->Detach();
-  ret.shape_ = shape;
-  return ret;
+  if (storage_type() == kDefaultStorage) {
+    NDArray ret = this->Detach();
+    ret.shape_ = shape;
+    return ret;
+#if MXNET_USE_MKLDNN == 1
+  } else if (storage_type() == kMKLDNNStorage) {
+    NDArray ret = this->Detach();
+    ret.shape_ = shape;
+    if (ret.ptr_->Mkl_mem_)
+      ret.ptr_->Mkl_mem_ = Reorder2Default(ret.ptr_->Mkl_mem_);
+    return ret;
+#endif
+  }
+  LOG(FATAL) << "Reshape for storage type " << storage_type() << " is not implemented yet";
+  return NDArray();
 }

 NDArray NDArray::ReshapeWithRecord(const TShape &shape) {
@@ -94,12 +270,34 @@ NDArray NDArray::ReshapeWithRecord(const TShape &shape) {
   return ret;
 }

-
 NDArray NDArray::Slice(index_t begin, index_t end) const {
   CHECK(!is_none()) << "NDArray is empty";
   CHECK_LE(begin, end)
       << "Invalid slicing range [" << begin << ", " << end << ")";
   CHECK_GE(shape_[0], end) << "Slice end index out of range";
+#if MXNET_USE_MKLDNN == 1
+  CHECK(storage_type() == kDefaultStorage || storage_type() == kMKLDNNStorage);
+  if (storage_type() == kMKLDNNStorage) {
+    TShape new_shape = shape_;
+    new_shape[0] = end - begin;
+    NDArray ret(kMKLDNNStorage, new_shape, ctx(), ptr_->delay_alloc, dtype());
+    size_t length = shape_.ProdShape(1, shape_.ndim());
+    MSHADOW_TYPE_SWITCH(ret.dtype(), DType, {
+      ret.byte_offset_ += begin * length * sizeof(DType);
+    });
+
+    // We need to convert the MKL memory to the default layout.
+    Engine::Get()->PushSync([&](RunContext ctx) {
+      auto def_format = GetDefaultFormat(this->ptr_->Mkl_mem_->get_primitive_desc().desc());
+      if (this->ptr_->Mkl_mem_->get_primitive_desc().desc().data.format != def_format) {
+        ret.ptr_->Mkl_mem_ = Reorder2Default(this->ptr_->Mkl_mem_);
+      }
+    }, ctx(), {this->var()}, {ret.var()},
+    FnProperty::kNormal, 0, PROFILER_MESSAGE("SyncMKLDNN2Default"));
+    return ret;
+  }
+#endif
   CHECK_EQ(storage_type(), kDefaultStorage);
   NDArray ret = this->Detach();
   size_t length = shape_.ProdShape(1, shape_.ndim());
@@ -180,6 +378,222 @@ void NDArray::set_fresh_out_grad(bool state) const {
   info.fresh_out_grad = state;
 }

+#if MXNET_USE_MKLDNN == 1
+static inline bool same_shape(const TShape &shape, mkldnn_dims_t dims, int ndims) {
+  if (shape.ndim() != ndims)
+    return false;
+  for (int i = 0; i < ndims; i++)
+    if (shape[i] != dims[i])
+      return false;
+  return true;
+}
+
+void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) {
+  // The shape of the array and the one of the MKL memory may mismatch.
+  // For example, if the array stores parameters, the MKL memory may store data
+  // in 5 dimensions while the NDArray stores data in 4 dimensions.
+  // TODO is it possible that the MKL memory is out-of-date?
+  if (Mkl_mem_ && storage_type == kMKLDNNStorage) {
+    return;
+  }
+
+  mkldnn::memory::dims dims;
+  // These are shapes supported by MKLDNN.
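+  // (1-D, 2-D and 4-D shapes map directly to the x, nc and nchw formats
+  // chosen below; 3-D shapes are padded to 4-D with a leading 1.)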
+  if (shape.ndim() == 1 || shape.ndim() == 2 || shape.ndim() == 4) {
+    dims.resize(shape.ndim());
+    for (size_t i = 0; i < dims.size(); i++)
+      dims[i] = shape[i];
+  }
+  // If there are 3 dimensions, we'll force it to 4 dimensions.
+  else if (shape.ndim() == 3) {
+    dims.resize(shape.ndim() + 1);
+    dims[0] = 1;
+    for (size_t i = 0; i < shape.ndim(); i++)
+      dims[i + 1] = shape[i];
+  }
+  else
+    LOG(FATAL) << "Unsupported number of dimensions for MKLDNN";
+  mkldnn::memory::format layout = mkldnn::memory::format::format_undef;
+  switch (dims.size()) {
+    case 1: layout = mkldnn::memory::format::x; break;
+    case 2: layout = mkldnn::memory::format::nc; break;
+    case 4: layout = mkldnn::memory::format::nchw; break;
+  }
+  mkldnn::memory::desc data_md{dims, get_mkldnn_type(dtype), layout};
+  auto cpu_engine = CpuEngine::Instance().get_engine();
+  // If the storage type is the default type, we can just simply
+  // reference to the memory for the default storage.
+  if (storage_type == kDefaultStorage) {
+    Mkl_mem_.reset(new mkldnn::memory(mkldnn::memory::primitive_desc(data_md,
+            cpu_engine), shandle.dptr));
+  }
+  // If the array uses MKLDNN storage, we need to allocate memory here.
+  else if (storage_type == kMKLDNNStorage) {
+    Mkl_mem_.reset(new mkldnn::memory(mkldnn::memory::primitive_desc(data_md,
+            cpu_engine)));
+  }
+}
+
+static int GetTypeSize(int dtype) {
+  MSHADOW_TYPE_SWITCH(dtype, DType, {
+    return sizeof(DType);
+  });
+  return -1;
+}
+
+std::shared_ptr<mkldnn::memory> NDArray::GetMKLDNNData(
+    const mkldnn::memory::primitive_desc &desc) const {
+  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
+    LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
+    return nullptr;
+  }
+  if (ptr_->storage_type == kDefaultStorage) {
+    ptr_->SetMKLMem(shape_, dtype_);
+  }
+  CHECK(ptr_->Mkl_mem_ != nullptr);
+  mkldnn::memory::primitive_desc _desc = desc;
+  auto desc1 = ptr_->Mkl_mem_->get_primitive_desc().desc();
+  auto desc2 = _desc.desc();
+  // The MKL memory has the same format and shape as required,
+  // or both use the default format, we can return the MKL memory.
+  if (ptr_->Mkl_mem_->get_primitive_desc() == desc) {
+    MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_);
+    return ptr_->Mkl_mem_;
+  }
+  else if (desc1.data.format == GetDefaultFormat(desc1)
+      && desc2.data.format == GetDefaultFormat(desc2)) {
+    MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_);
+    mkldnn_mem_ptr ret(new mkldnn::memory(desc, ptr_->Mkl_mem_->get_data_handle()));
+    MKLDNNStream::Instance().RegisterMem(ret);
+    return ret;
+  }
+  else
+    return nullptr;
+}
+
+std::shared_ptr<mkldnn::memory> NDArray::GetMKLDNNDataReorder(
+    const mkldnn::memory::primitive_desc &desc) const {
+  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
+    LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
+    return nullptr;
+  }
+  if (ptr_->storage_type == kDefaultStorage) {
+    ptr_->SetMKLMem(shape_, dtype_);
+  }
+  // If the array uses the default format, the MKL memory now references to
+  // the default storage. If it uses the MKLDNN format, the MKL memory should
+  // have been initialized since we are trying to get data from the array.
+  CHECK(ptr_->Mkl_mem_ != nullptr);
+  // If the memory descriptor matches, it's easy.
+  MKLDNNStream &stream = MKLDNNStream::Instance();
+  // We need to make sure Mkl_mem_ is always valid as well.
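+  // (Registering Mkl_mem_ keeps a reference alive until the stream is
+  // submitted, so it cannot be freed while a queued primitive still uses it.)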
+ stream.RegisterMem(ptr_->Mkl_mem_); + if (ptr_->Mkl_mem_->get_primitive_desc() == desc) { + return ptr_->Mkl_mem_; + } + + mkldnn::memory::primitive_desc _desc = desc; + // Now we need to determine if we should reorder the memory. + // If both use the default formats, we think we don't need to reshape. + // TODO if the memory format isn't the default one, it may not work. + auto desc1 = ptr_->Mkl_mem_->get_primitive_desc().desc(); + auto desc2 = _desc.desc(); + if (desc1.data.format == GetDefaultFormat(desc1) && + desc2.data.format == GetDefaultFormat(desc2)) { + mkldnn_mem_ptr ret(new mkldnn::memory(desc, ptr_->Mkl_mem_->get_data_handle())); + stream.RegisterMem(ret); + return ret; + } + else { + // TODO we should manage the memory allocation here. + mkldnn_mem_ptr ret(new mkldnn::memory(desc)); + stream.RegisterMem(ret); + stream.RegisterPrim(mkldnn::reorder(*ptr_->Mkl_mem_, *ret)); + return ret; + } +} + +std::shared_ptr NDArray::GetMKLDNNData() const { + ptr_->SetMKLMem(shape_, dtype_); + if (ptr_->Mkl_mem_) { + MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); + return ptr_->Mkl_mem_; + } + else + // TODO We don't support converting sparse format. + return nullptr; +} + +void NDArray::CopyFrom(const mkldnn::memory &mem) { + if (ptr_ == nullptr) { + LOG(FATAL) << "The NDArray hasn't been initialized"; + return; + } + if (ptr_->Mkl_mem_.get() == &mem) + return; + + // TODO if the shape mismatches. + ptr_->SetMKLMem(shape_, dtype_); + MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder(mem, *ptr_->Mkl_mem_)); +} + +std::shared_ptr NDArray::CreateMKLDNNData( + const mkldnn::memory::primitive_desc &desc) { + mkldnn::memory::primitive_desc _desc = desc; + auto required_format = _desc.desc().data.format; + auto def_format = GetDefaultFormat(_desc.desc()); + if (storage_type() != kMKLDNNStorage && required_format != def_format) + return nullptr; + + if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { + LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; + return nullptr; + } + + // If the required format is a default format, we don't need to worry about the shape. + // If the shape isn't the same, it actually implicitly reshapes data. + if (required_format == def_format) { + ptr_->SetMKLMem(shape_, dtype_); + MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); + return ptr_->Mkl_mem_; + } + + if (ptr_->Mkl_mem_ && ptr_->Mkl_mem_->get_primitive_desc() == desc) { + MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); + return ptr_->Mkl_mem_; + } + + ptr_->Mkl_mem_ = CreateMKLDNNMem(desc); + return ptr_->Mkl_mem_; +} +#endif + +void NDArray::SetTBlob() const { + CHECK(ptr_ != nullptr); + TShape shape = shape_; + char *dptr = static_cast(ptr_->shandle.dptr); + auto stype = storage_type(); + if (stype == kDefaultStorage) { + dptr += byte_offset_; + } else if (stype == kCSRStorage || stype == kRowSparseStorage) { + CHECK_EQ(byte_offset_, 0); + shape = storage_shape(); +#if MXNET_USE_MKLDNN == 1 + } else if (stype == kMKLDNNStorage) { + if (ptr_->Mkl_mem_) + ptr_->Mkl_mem_ = Reorder2Default(ptr_->Mkl_mem_); + else + ptr_->SetMKLMem(shape_, dtype_); + dptr = (char *) ptr_->Mkl_mem_->get_data_handle(); +#endif + } else { + LOG(FATAL) << "unknown storage type " << stype; + } + tblob_.dptr_ = dptr; + tblob_.shape_ = shape; + tblob_.type_flag_ = dtype_; + tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id); +} /*! 
* \brief run a ternary operation @@ -451,6 +865,16 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext from.ctx(), to.ctx(), ctx); } +#if MXNET_USE_MKLDNN == 1 +inline void CopyFromToMKLDNNImpl(const NDArray& from, const NDArray& to, RunContext ctx) { + auto from_mem = from.GetMKLDNNData(); + auto to_mem = to.GetMKLDNNData(); + size_t size = std::min(from_mem->get_primitive_desc().get_size(), + to_mem->get_primitive_desc().get_size()); + memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size); +} +#endif + // Make a copy of an NDArray based on storage type template void CopyFromToImpl(const NDArray& from, const NDArray& to, RunContext rctx) { @@ -500,6 +924,10 @@ void CopyFromToImpl(const NDArray& from, const NDArray& to, RunContext rctx) { CopyFromToRspImpl(casted_nd, to, rctx); } else if (to_stype == kCSRStorage) { CopyFromToCsrImpl(casted_nd, to, rctx); +#if MXNET_USE_MKLDNN == 1 + } else if (to_stype == kMKLDNNStorage) { + CopyFromToMKLDNNImpl(casted_nd, to, rctx); +#endif } else { LOG(FATAL) << "unknown storage type" << to_stype; } diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h deleted file mode 100644 index bb5a37fc8794..000000000000 --- a/src/operator/activation-inl.h +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file activation-inl.h - * \brief Activation operator - * \author Bing Xu -*/ - -#ifndef MXNET_OPERATOR_ACTIVATION_INL_H_ -#define MXNET_OPERATOR_ACTIVATION_INL_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "./operator_common.h" -#include "./mxnet_op.h" - -namespace mxnet { -namespace op { -// Declare enumeration of input order to make code more intuitive. -// // These enums are only visible within this header -namespace activation { -enum ActivationOpInputs {kData}; -enum ActivationOpOutputs {kOut}; -enum ActivationOpType {kReLU, kSigmoid, kTanh, kSoftReLU}; -} // activation - -struct ActivationParam : public dmlc::Parameter { - // use int for enumeration - int act_type; - DMLC_DECLARE_PARAMETER(ActivationParam) { - DMLC_DECLARE_FIELD(act_type) - .add_enum("relu", activation::kReLU) - .add_enum("sigmoid", activation::kSigmoid) - .add_enum("tanh", activation::kTanh) - .add_enum("softrelu", activation::kSoftReLU) - .describe("Activation function to be applied."); - } -}; - -/** - * \brief This is the implementation of activation operator. - * \tparam xpu The device that the op will be executed on. 
- */ -template -class ActivationOp : public Operator { - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - Stream *s = ctx.get_stream(); - const TBlob& input = in_data[activation::kData]; - const size_t sz = input.shape_.Size(); - if (sz) { - MXNET_ASSIGN_REQ_SWITCH(req[activation::kOut], Req, { - mxnet_op::Kernel, xpu>::Launch( - s, sz, - out_data[activation::kOut].dptr(), - input.dptr()); - }); - } - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK(in_data.size() == 1 && in_grad.size() == 1); - CHECK_EQ(req.size(), 1U); - Stream *s = ctx.get_stream(); - const TBlob& m_out_grad = out_grad[activation::kOut]; - const TBlob& m_out_data = out_data[activation::kOut]; - const TBlob& m_in_grad = in_grad[activation::kData]; - const size_t sz = m_out_data.shape_.Size(); - if (sz) { - MXNET_ASSIGN_REQ_SWITCH(req[activation::kData], Req, { - mxnet_op::Kernel, Req>, xpu>::Launch( - s, sz, - m_in_grad.dptr(), - m_out_grad.dptr(), - m_out_data.dptr()); - }); - } - } -}; // class ActivationOp - -// Declare Factory function, used for dispatch specialization -template -Operator* CreateOp(ActivationParam type, int dtype, const TShape& dshape); - -#if DMLC_USE_CXX11 -class ActivationProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 1U) << "Input:[data]"; - const TShape &dshape = in_shape->at(activation::kData); - if (dshape.ndim() == 0) return false; - out_shape->clear(); - out_shape->push_back(dshape); - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } - } - out_type->clear(); - out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new ActivationProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "Activation"; - } - - // decalre dependency and inplace optimization options - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { -#if MXNET_USE_CUDNN == 1 - return {out_grad[activation::kOut], out_data[activation::kOut], in_data[activation::kData]}; -#else - return {out_grad[activation::kOut], out_data[activation::kOut]}; -#endif // MXNET_USE_CUDNN - } - - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector 
&in_grad) const override { - return {{out_grad[activation::kOut], in_grad[activation::kData]}}; - } - - std::vector > ForwardInplaceOption( - const std::vector &in_data, - const std::vector &out_data) const override { - return {{in_data[activation::kData], out_data[activation::kOut]}}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; - - private: - ActivationParam param_; -}; -#endif // DMLC_USE_CXX11 -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_ACTIVATION_INL_H_ diff --git a/src/operator/activation.cc b/src/operator/activation.cc deleted file mode 100644 index a33c11ce546d..000000000000 --- a/src/operator/activation.cc +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file activation.cc - * \brief activation op - * \author Bing Xu -*/ -#include "./activation-inl.h" -#include "./mshadow_op.h" -#if MXNET_USE_MKL2017 == 1 -#include -#include "./mkl/mkl_memory-inl.h" -#include "./mkl/mkl_relu-inl.h" -#endif // MXNET_USE_MKL2017 - -namespace mxnet { -namespace op { -template<> -Operator *CreateOp(ActivationParam param, int dtype, const TShape& dshape) { - Operator *op = NULL; -#if MXNET_USE_MKL2017 == 1 - if (param.act_type == activation::kReLU && dshape.ndim() <= 4) { - switch (dtype) { - case mshadow::kFloat32: - return new MKLReluOp(); - case mshadow::kFloat64: - return new MKLReluOp(); - default: - break; - } - } - if (enableMKLWarnGenerated()) - LOG(INFO) << MKLReluOp::getName() << " Skip MKL optimization"; -#endif - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - switch (param.act_type) { - case activation::kReLU: - op = new ActivationOp(); - break; - case activation::kSigmoid: - op = new ActivationOp(); - break; - case activation::kTanh: - op = new ActivationOp(); - break; - case activation::kSoftReLU: - op = new ActivationOp(); - break; - default: - LOG(FATAL) << "unknown activation type"; - } - }) - return op; -} - -// DO_BIND_DISPATCH comes from operator_common.h -Operator *ActivationProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_shape)[0]); -} - -DMLC_REGISTER_PARAMETER(ActivationParam); - -MXNET_REGISTER_OP_PROPERTY(Activation, ActivationProp) -.describe(R"code(Applies an activation function element-wise to the input. 
- -The following activation functions are supported: - -- `relu`: Rectified Linear Unit, :math:`y = max(x, 0)` -- `sigmoid`: :math:`y = \frac{1}{1 + exp(-x)}` -- `tanh`: Hyperbolic tangent, :math:`y = \frac{exp(x) - exp(-x)}{exp(x) + exp(-x)}` -- `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))` - -)code" ADD_FILELINE) -.add_argument("data", "NDArray-or-Symbol", "Input array to activation function.") -.add_arguments(ActivationParam::__FIELDS__()); - -} // namespace op -} // namespace mxnet diff --git a/src/operator/activation.cu b/src/operator/activation.cu deleted file mode 100644 index 0ac51ad03109..000000000000 --- a/src/operator/activation.cu +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file activation.cu - * \brief - * \author Bing Xu -*/ -#include "./activation-inl.h" -#include "./mshadow_op.h" -#if MXNET_USE_CUDNN == 1 -#include "./cudnn_activation-inl.h" -#endif - -namespace mxnet { -namespace op { -template<> -Operator *CreateOp(ActivationParam param, int dtype, const TShape& dshape) { - Operator *op = NULL; - // SoftReLU not supported by CUDNN yet - if (param.act_type == activation::kSoftReLU) { - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ActivationOp(); - }) - return op; - } - -#if MXNET_USE_CUDNN == 1 - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new CuDNNActivationOp(param); - }) -#else - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - switch (param.act_type) { - case activation::kReLU: - op = new ActivationOp(); - break; - case activation::kSigmoid: - op = new ActivationOp(); - break; - case activation::kTanh: - op = new ActivationOp(); - break; - default: - LOG(FATAL) << "unknown activation"; - } - }) -#endif // MXNET_USE_CUDNN - return op; -} -} // namespace op -} // namespace mxnet diff --git a/src/operator/convolution.cc b/src/operator/convolution.cc deleted file mode 100644 index 55cfe4e085dc..000000000000 --- a/src/operator/convolution.cc +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file convolution.cc - * \brief - * \author Bing Xu, Jun Wu -*/ - -#include "./convolution-inl.h" -#if MXNET_USE_MKL2017 == 1 -#include -#include "./mkl/mkl_memory-inl.h" -#include "./mkl/mkl_convolution-inl.h" -#endif // MXNET_USE_MKL2017 -#if MXNET_USE_NNPACK == 1 -#include "./nnpack/nnpack_convolution-inl.h" -#endif // MXNET_USE_NNPACK - -namespace mxnet { -namespace op { -DMLC_REGISTER_PARAMETER(ConvolutionParam); - -template<> -Operator* CreateOp(ConvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx) { - Operator *op = NULL; - // If 1D convolution, use MXNet implementation - if (param.kernel.ndim() == 1) { - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ConvolutionOp(param); - }) - return op; - } -#if MXNET_USE_MKL2017 == 1 - if ((param.dilate[0] == 1 && param.dilate[1] == 1) - && param.kernel.ndim() == 2) { - switch (dtype) { - case mshadow::kFloat32: - return new MKLConvolutionOp(param); - case mshadow::kFloat64: - return new MKLConvolutionOp(param); - default: - break; - } - } -#endif -#if MXNET_USE_NNPACK == 1 - const size_t batch_size = (*in_shape)[0][0]; - if ((param.dilate[0] == 1 && param.dilate[1] == 1) - && param.kernel.ndim() == 2 && (!param.no_bias) - && param.num_group == 1 && (batch_size == 1 || - ((batch_size > 1) && (param.stride[0] == 1) && - (param.stride[1] == 1)))) { - switch (dtype) { - case mshadow::kFloat32: - return new NNPACKConvolutionOp(param); - default: - break; - } - } -#endif - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ConvolutionOp(param); - }) - return op; -} - -// DO_BIND_DISPATCH comes from operator_common.h -Operator *ConvolutionProp::CreateOperatorEx(Context ctx, - std::vector *in_shape, - std::vector *in_type) const { - std::vector out_shape, aux_shape; - CHECK(InferShape(in_shape, &out_shape, &aux_shape)); - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx); -} - -MXNET_REGISTER_OP_PROPERTY(Convolution, ConvolutionProp) -.describe(R"code(Compute *N*-D convolution on *(N+2)*-D input. - -In the 2-D convolution, given input data with shape *(batch_size, -channel, height, width)*, the output is computed by - -.. math:: - - out[n,i,:,:] = bias[i] + \sum_{j=0}^{channel} data[n,j,:,:] \star - weight[i,j,:,:] - -where :math:`\star` is the 2-D cross-correlation operator. - -For general 2-D convolution, the shapes are - -- **data**: *(batch_size, channel, height, width)* -- **weight**: *(num_filter, channel, kernel[0], kernel[1])* -- **bias**: *(num_filter,)* -- **out**: *(batch_size, num_filter, out_height, out_width)*. - -Define:: - - f(x,k,p,s,d) = floor((x+2*p-d*(k-1)-1)/s)+1 - -then we have:: - - out_height=f(height, kernel[0], pad[0], stride[0], dilate[0]) - out_width=f(width, kernel[1], pad[1], stride[1], dilate[1]) - -If ``no_bias`` is set to be true, then the ``bias`` term is ignored. - -The default data ``layout`` is *NCHW*, namely *(batch_size, channel, height, -width)*. We can choose other layouts such as *NHWC*. - -If ``num_group`` is larger than 1, denoted by *g*, then split the input ``data`` -evenly into *g* parts along the channel axis, and also evenly split ``weight`` -along the first dimension. Next compute the convolution on the *i*-th part of -the data with the *i*-th weight part. The output is obtained by concatenating all -the *g* results. - -1-D convolution does not have *height* dimension but only *width* in space. 
- -- **data**: *(batch_size, channel, width)* -- **weight**: *(num_filter, channel, kernel[0])* -- **bias**: *(num_filter,)* -- **out**: *(batch_size, num_filter, out_width)*. - -3-D convolution adds an additional *depth* dimension besides *height* and -*width*. The shapes are - -- **data**: *(batch_size, channel, depth, height, width)* -- **weight**: *(num_filter, channel, kernel[0], kernel[1], kernel[2])* -- **bias**: *(num_filter,)* -- **out**: *(batch_size, num_filter, out_depth, out_height, out_width)*. - -Both ``weight`` and ``bias`` are learnable parameters. - -There are other options to tune the performance. - -- **cudnn_tune**: enable this option leads to higher startup time but may give - faster speed. Options are - - - **off**: no tuning - - **limited_workspace**:run test and pick the fastest algorithm that doesn't - exceed workspace limit. - - **fastest**: pick the fastest algorithm and ignore workspace limit. - - **None** (default): the behavior is determined by environment variable - ``MXNET_CUDNN_AUTOTUNE_DEFAULT``. 0 for off, 1 for limited workspace - (default), 2 for fastest. - -- **workspace**: A large number leads to more (GPU) memory usage but may improve - the performance. - -)code" ADD_FILELINE) -.add_argument("data", "NDArray-or-Symbol", "Input data to the ConvolutionOp.") -.add_argument("weight", "NDArray-or-Symbol", "Weight matrix.") -.add_argument("bias", "NDArray-or-Symbol", "Bias parameter.") -.add_arguments(ConvolutionParam::__FIELDS__()); - -} // namespace op -} // namespace mxnet diff --git a/src/operator/convolution.cu b/src/operator/convolution.cu deleted file mode 100644 index b327f3cff424..000000000000 --- a/src/operator/convolution.cu +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
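
The shape rule f(x,k,p,s,d) = floor((x+2*p-d*(k-1)-1)/s)+1 quoted in the Convolution docstring above is worth sanity-checking in isolation; a few lines of C++ suffice (conv_out_size is an illustrative name, not part of the codebase)::

  #include <cstdio>

  // out = floor((x + 2*p - d*(k-1) - 1) / s) + 1, per the Convolution docstring;
  // integer division equals floor() here because the numerator is non-negative
  // for any valid configuration.
  int conv_out_size(int x, int k, int p, int s, int d) {
    return (x + 2 * p - d * (k - 1) - 1) / s + 1;
  }

  int main() {
    // 224x224 input, 3x3 kernel, pad 1, stride 2, dilation 1 -> 112.
    std::printf("%d\n", conv_out_size(224, 3, 1, 2, 1));
  }
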
- * \file convolution.cu - * \brief - * \author Bing Xu, Jun Wu -*/ - -#include "./convolution-inl.h" -#include <vector> -#if MXNET_USE_CUDNN == 1 -#include "./cudnn_convolution-inl.h" -#endif // MXNET_USE_CUDNN - -#include "./depthwise_convolution-inl.h" - -namespace mxnet { -namespace op { - -template<> -Operator* CreateOp<gpu>(ConvolutionParam param, int dtype, - std::vector<TShape> *in_shape, - std::vector<TShape> *out_shape, - Context ctx) { - Operator *op = NULL; - // If 1D convolution, use MXNet implementation - if (param.kernel.ndim() == 1) { - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ConvolutionOp<gpu, DType>(param); - }) - return op; - } - - // depth wise conv - if (param.num_filter == param.num_group && - param.layout.value() == mshadow::kNCHW && - param.num_filter == (*in_shape)[conv::kData][1] && - param.kernel.ndim() == 2 && - param.dilate == mshadow::Shape2(1, 1) && - dtype == mshadow::kFloat32) { - op = new DepthwiseConvolutionOp<float>(param, *in_shape, *out_shape); - return op; - } - -#if MXNET_USE_CUDNN == 1 - // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16). - int compute_type = (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype; - - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - if (param.cudnn_off) { - op = new ConvolutionOp<gpu, DType>(param); - } else if (!CuDNNConvolutionOp<DType>::Supports(param, compute_type, compute_type, ctx)) { - LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied."; - op = new ConvolutionOp<gpu, DType>(param); - } else { - op = new CuDNNConvolutionOp<DType>(param, compute_type, compute_type, - *in_shape, *out_shape, ctx); - } - }) -#else - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ConvolutionOp<gpu, DType>(param); - }) -#endif // MXNET_USE_CUDNN - return op; -} - -} // namespace op -} // namespace mxnet - diff --git a/src/operator/convolution_v1.cu b/src/operator/convolution_v1.cu index b20b4b249224..186462dd9cd3 100644 --- a/src/operator/convolution_v1.cu +++ b/src/operator/convolution_v1.cu @@ -25,9 +25,6 @@ #include "./convolution_v1-inl.h" #include <vector> -#if MXNET_USE_CUDNN == 1 -#include "./cudnn_convolution-inl.h" -#endif // MXNET_USE_CUDNN namespace mxnet { namespace op { diff --git a/src/operator/cudnn_batch_norm.cc b/src/operator/cudnn_batch_norm.cc deleted file mode 100644 index 28c592b78ccf..000000000000 --- a/src/operator/cudnn_batch_norm.cc +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*!
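
The deleted CreateOp<gpu> above routes to the depthwise kernel only under a narrow conjunction of conditions, and for fp16 I/O it computes in fp32 (pseudo-fp16). A condensed, free-standing sketch of that dispatch logic, with illustrative names::

  // Mirrors the depthwise test in the deleted CreateOp<gpu>: NCHW fp32 2-D
  // kernels, unit dilation, and one group per input channel.
  bool use_depthwise(int num_filter, int num_group, int in_channels,
                     int kernel_ndim, bool nchw, bool unit_dilate, bool fp32) {
    return num_filter == num_group && nchw && num_filter == in_channels &&
           kernel_ndim == 2 && unit_dilate && fp32;
  }

  // Pseudo-fp16: keep fp16 storage but accumulate in fp32, as the cuDNN branch does.
  enum DTypeTag { kFloat16, kFloat32 };
  DTypeTag compute_type(DTypeTag io_type) {
    return io_type == kFloat16 ? kFloat32 : io_type;
  }
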
- * \file cudnn_batch_norm.cc - * \brief - * \author Junyuan Xie -*/ - -#include "./cudnn_batch_norm-inl.h" -#include - -namespace mxnet { -namespace op { -#if CUDNN_MAJOR >= 4 -template<> -Operator *CreateOp_CuDNNv4(BatchNormParam param) { - LOG(FATAL) << "CuDNNBatchNormOp is only available for gpu."; - return NULL; -} - -Operator *CuDNNBatchNormProp::CreateOperator(Context ctx) const { -#if CUDNN_MAJOR >= 5 - LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5." - "Use the later instead."; - return nullptr; -#else - DO_BIND_DISPATCH(CreateOp_CuDNNv4, param_); -#endif -} - -MXNET_REGISTER_OP_PROPERTY(CuDNNBatchNorm, CuDNNBatchNormProp) -.describe("Apply batch normalization to input.") -.add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization") -.add_arguments(BatchNormParam::__FIELDS__()); - -NNVM_REGISTER_OP(CuDNNBatchNorm) -.set_attr("FSetInputVarAttrOnCompose", - [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { - if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return; - if (index == 3) { - var->attrs.dict["__init__"] = "[\"zero\", {}]"; - } else if (index == 4) { - var->attrs.dict["__init__"] = "[\"zero\", {}]"; - } - }); -#endif // CUDNN_MAJOR >= 4 -} // namespace op -} // namespace mxnet diff --git a/src/operator/cudnn_pooling-inl.h b/src/operator/cudnn_pooling-inl.h deleted file mode 100644 index 5b03fe5ee6f3..000000000000 --- a/src/operator/cudnn_pooling-inl.h +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
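
The cudnn_pooling header removed below hand-builds NCDHW size and stride arrays for cudnnSetTensorNdDescriptor; for a packed layout every stride is just the product of the trailing dimensions. A minimal sketch of that computation (the helper name is illustrative)::

  #include <vector>

  // Packed (contiguous) strides for a shape with the innermost dimension last,
  // matching the istride/ostride arrays in the deleted pooling header.
  std::vector<int> packed_strides(const std::vector<int>& shape) {
    std::vector<int> strides(shape.size(), 1);
    for (int d = static_cast<int>(shape.size()) - 2; d >= 0; --d)
      strides[d] = strides[d + 1] * shape[d + 1];
    return strides;
  }
  // e.g. shape {2, 3, 4, 5, 6} (NCDHW) -> strides {360, 120, 30, 6, 1}
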
- * \file cudnn_pooling-inl.h - * \brief - * \author Bing Xu -*/ - -#ifndef MXNET_OPERATOR_CUDNN_POOLING_INL_H_ -#define MXNET_OPERATOR_CUDNN_POOLING_INL_H_ -#include -#include -#include "./pooling-inl.h" - -namespace mxnet { -namespace op { - -template -class CuDNNPoolingOp : public Operator { - public: - explicit CuDNNPoolingOp(PoolingParam p) { - param_ = p; - init_cudnn_ = false; - // TODO(xxx): fp16 - dtype_ = mshadow::DataType::kCudnnFlag; - switch (param_.pool_type) { - case pool_enum::kMaxPooling: - mode_ = CUDNN_POOLING_MAX; - break; - case pool_enum::kAvgPooling: - mode_ = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; - break; - default: - LOG(FATAL) << "Not implmented"; - } - } - - ~CuDNNPoolingOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); - CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_)); - } - } - - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - Stream *s = ctx.get_stream(); - CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); - typename DataType::ScaleType alpha = 1.0f; - typename DataType::ScaleType beta = 0.0f; - if (param_.kernel.ndim() == 2) { - // 2d pool - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); - if (!init_cudnn_) { - this->Init(s, in_data, out_data); - } - CHECK_EQ(data.CheckContiguous(), true); - CHECK_EQ(out.CheckContiguous(), true); - CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_, - pooling_desc_, - &alpha, - in_desc_, - data.dptr_, - &beta, - out_desc_, - out.dptr_)); - } else if (param_.kernel.ndim() == 3) { - // 3d pool - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); - if (!init_cudnn_) { - this->Init(s, in_data, out_data); - } - CHECK_EQ(data.CheckContiguous(), true); - CHECK_EQ(out.CheckContiguous(), true); - CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_, - pooling_desc_, - &alpha, - in_desc_, - data.dptr_, - &beta, - out_desc_, - out.dptr_)); - } else { - LOG(FATAL) << "Only support 2D or 3D pooling"; - } - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - CHECK_EQ(req.size(), 1U); - CHECK_EQ(in_grad.size(), 1U); - - Stream *s = ctx.get_stream(); - CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); - typename DataType::ScaleType alpha = 1.0f; - typename DataType::ScaleType beta = 0.0f; - if (param_.kernel.ndim() == 2) { - // 2d pool - Tensor m_out_grad = out_grad[pool_enum::kOut].get(s); - Tensor m_in_data = in_data[pool_enum::kData].get(s); - Tensor m_out_data = out_data[pool_enum::kOut].get(s); - Tensor m_in_grad = in_grad[pool_enum::kData].get(s); - CUDNN_CALL(cudnnPoolingBackward(s->dnn_handle_, - pooling_desc_, - &alpha, - out_desc_, - m_out_data.dptr_, - out_desc_, - m_out_grad.dptr_, - in_desc_, - m_in_data.dptr_, - &beta, - in_desc_, - m_in_grad.dptr_)); - } else if (param_.kernel.ndim() == 3) { - // 3d pool - Tensor m_out_grad = 
out_grad[pool_enum::kOut].get(s); - Tensor m_in_data = in_data[pool_enum::kData].get(s); - Tensor m_out_data = out_data[pool_enum::kOut].get(s); - Tensor m_in_grad = in_grad[pool_enum::kData].get(s); - CUDNN_CALL(cudnnPoolingBackward(s->dnn_handle_, - pooling_desc_, - &alpha, - out_desc_, - m_out_data.dptr_, - out_desc_, - m_out_grad.dptr_, - in_desc_, - m_in_data.dptr_, - &beta, - in_desc_, - m_in_grad.dptr_)); - } else { - LOG(FATAL) << "Only support 2D or 3D pooling"; - } - } - - private: - inline void Init(mshadow::Stream *s, - const std::vector &in_data, - const std::vector &out_data) { - using namespace mshadow; - #if CUDNN_MAJOR >= 5 - nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; - #endif - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - if (!init_cudnn_) { - init_cudnn_ = true; - if (param_.kernel.ndim() == 2) { - // 2d conv - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); - mshadow::Shape<4> dshape = data.shape_; - CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); - CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - data.shape_[0], - data.shape_[1], - data.shape_[2], - data.shape_[3])); - CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - out.shape_[0], - out.shape_[1], - out.shape_[2], - out.shape_[3])); - #if CUDNN_MAJOR >= 5 - CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, - mode_, - nan_prop_, - param_.global_pool ? dshape[2] : param_.kernel[0], - param_.global_pool ? dshape[3] : param_.kernel[1], - param_.pad[0], - param_.pad[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 :param_.stride[1])); - #else - CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, - mode_, - param_.global_pool ? dshape[2] : param_.kernel[0], - param_.global_pool ? dshape[3] : param_.kernel[1], - param_.pad[0], - param_.pad[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 : param_.stride[1])); - #endif - } else { - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); - CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); - std::vector ishape = {static_cast(data.shape_[0]), - static_cast(data.shape_[1]), - static_cast(data.shape_[2]), - static_cast(data.shape_[3]), - static_cast(data.shape_[4])}; - - std::vector istride = {static_cast(ishape[1] * ishape[2] * ishape[3] * ishape[4]), - static_cast(ishape[2] * ishape[3] * ishape[4]), - static_cast(ishape[3] * ishape[4]), - static_cast(ishape[4]), - 1}; - - std::vector oshape = {static_cast(out.shape_[0]), - static_cast(out.shape_[1]), - static_cast(out.shape_[2]), - static_cast(out.shape_[3]), - static_cast(out.shape_[4])}; - - std::vector ostride = {static_cast(oshape[1] * oshape[2] * oshape[3] * oshape[4]), - static_cast(oshape[2] * oshape[3] * oshape[4]), - static_cast(oshape[3] * oshape[4]), - static_cast(oshape[4]), - 1}; - - std::vector kernel_vec = {param_.global_pool ? ishape[2] : - static_cast(param_.kernel[0]), - param_.global_pool ? ishape[3] : - static_cast(param_.kernel[1]), - param_.global_pool ? ishape[4] : - static_cast(param_.kernel[2])}; - - std::vector pad_vec = {param_.global_pool ? 0 : static_cast(param_.pad[0]), - param_.global_pool ? 0 : static_cast(param_.pad[1]), - param_.global_pool ? 
0 : static_cast(param_.pad[2])}; - - std::vector stride_vec = {param_.global_pool ? 1 : static_cast(param_.stride[0]), - param_.global_pool ? 1 : static_cast(param_.stride[1]), - param_.global_pool ? 1 : static_cast(param_.stride[2])}; - - CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_, - dtype_, - static_cast(ishape.size()), - &ishape[0], - &istride[0])); - CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_, - dtype_, - static_cast(oshape.size()), - &oshape[0], - &ostride[0])); - #if CUDNN_MAJOR >= 5 - CUDNN_CALL(cudnnSetPoolingNdDescriptor(pooling_desc_, - mode_, - nan_prop_, - static_cast(kernel_vec.size()), - &(kernel_vec[0]), - &(pad_vec[0]), - &(stride_vec[0]))); - #else - LOG(FATAL) << "3D pooling only support CUDNN v5 and abouve"; - #endif - } - } - } - bool init_cudnn_; - cudnnDataType_t dtype_; - cudnnHandle_t handle_; - cudnnPoolingMode_t mode_; - cudnnTensorDescriptor_t in_desc_; - cudnnTensorDescriptor_t out_desc_; - cudnnPoolingDescriptor_t pooling_desc_; - #if CUDNN_MAJOR >= 5 - cudnnNanPropagation_t nan_prop_; - #endif - PoolingParam param_; -}; // class CuDNNPoolingOp -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_CUDNN_POOLING_INL_H_ - diff --git a/src/operator/deconvolution.cc b/src/operator/deconvolution.cc deleted file mode 100644 index 6a59ff6588ff..000000000000 --- a/src/operator/deconvolution.cc +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file deconvolution.cc - * \brief - * \author Wei Wu -*/ - -#include "./deconvolution-inl.h" - -namespace mxnet { -namespace op { -template<> -Operator* CreateOp(DeconvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx) { - Operator *op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new DeconvolutionOp(param); - }); - return op; -} - -Operator* DeconvolutionProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - std::vector out_shape, aux_shape; - CHECK(InferShape(in_shape, &out_shape, &aux_shape)); - DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0), in_shape, &out_shape, ctx); -} - -DMLC_REGISTER_PARAMETER(DeconvolutionParam); - -MXNET_REGISTER_OP_PROPERTY(Deconvolution, DeconvolutionProp) -.add_argument("data", "NDArray-or-Symbol", "Input tensor to the deconvolution operation.") -.add_argument("weight", "NDArray-or-Symbol", "Weights representing the kernel.") -.add_argument("bias", "NDArray-or-Symbol", "Bias added to the result after the deconvolution " - "operation.") -.add_arguments(DeconvolutionParam::__FIELDS__()) -.describe("Computes 2D transposed convolution (aka fractionally strided convolution) of the " - "input tensor. 
This operation can be seen as the gradient of Convolution operation with " - "respect to its input. Convolution usually reduces the size of the input. Transposed " - "convolution works the other way, going from a smaller input to a larger output while " - "preserving the connectivity pattern."); - -} // namespace op -} // namespace mxnet diff --git a/src/operator/deconvolution.cu b/src/operator/deconvolution.cu deleted file mode 100644 index de7dff5569ed..000000000000 --- a/src/operator/deconvolution.cu +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file deconvolution.cu - * \brief - * \author Wei Wu -*/ - -#include "./deconvolution-inl.h" -#if MXNET_USE_CUDNN == 1 -#include "./cudnn_deconvolution-inl.h" -#endif // MXNET_USE_CUDNN - -namespace mxnet { -namespace op { -template<> -Operator* CreateOp(DeconvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx) { - // Logic here parallels that in Convolution.cu - Operator *op = NULL; - // If 1D deconvolution, use MXNet implementation - if (param.kernel.ndim() == 1) { - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new DeconvolutionOp(param); - }) - return op; - } -#if MXNET_USE_CUDNN == 1 - // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16). - int compute_type = (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype; - - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - if (param.cudnn_off) { - op = new DeconvolutionOp(param); - } else if (!CuDNNDeconvolutionOp::Supports(param, compute_type, compute_type, ctx)) { - LOG(WARNING) << - "This deconvolution is not supported by cudnn, MXNET deconvolution is applied."; - op = new DeconvolutionOp(param); - } else { - op = new CuDNNDeconvolutionOp(param, compute_type, compute_type, - *in_shape, *out_shape, ctx); - } - }) -#else - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new DeconvolutionOp(param); - }) -#endif // MXNET_USE_CUDNN - return op; -} - -} // namespace op -} // namespace mxnet diff --git a/src/operator/dropout.cc b/src/operator/dropout.cc deleted file mode 100644 index af65578ec6f8..000000000000 --- a/src/operator/dropout.cc +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
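
As the Deconvolution description above notes, transposed convolution inverts the shape arithmetic of Convolution. For dilation 1 the customary inverse of the earlier f(x,k,p,s,d) is sketched below; deconv_out_size is an illustrative name and adj stands for the usual output-padding term that disambiguates sizes collapsed by floor()::

  // out = (in - 1) * stride - 2 * pad + kernel + adj  (dilation 1)
  int deconv_out_size(int in, int k, int p, int s, int adj = 0) {
    return (in - 1) * s - 2 * p + k + adj;
  }
  // Round trip with the convolution rule: conv maps 224 -> 112 for k=3, p=1,
  // s=2, and deconv_out_size(112, 3, 1, 2, 1) == 224.
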
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file dropout.cc - * \brief - * \author Bing Xu -*/ - -#include "./dropout-inl.h" - -namespace mxnet { -namespace op { -template<> -Operator *CreateOp<cpu>(DropoutParam param, int dtype) { - Operator *op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new DropoutOp<cpu, DType>(param); - }); - return op; -} - -// DO_BIND_DISPATCH comes from operator_common.h -Operator *DropoutProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape, - std::vector<int> *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); -} - -DMLC_REGISTER_PARAMETER(DropoutParam); - -MXNET_REGISTER_OP_PROPERTY(Dropout, DropoutProp) -.describe(R"(Applies dropout operation to input array. - -- During training, each element of the input is set to zero with probability p. - The whole array is rescaled by :math:`1/(1-p)` to keep the expected - sum of the input unchanged. - -- During testing, this operator does not change the input if mode is 'training'. - If mode is 'always', the same computation as during training will be applied. - -Example:: - - random.seed(998) - input_array = array([[3., 0.5, -0.5, 2., 7.], - [2., -0.4, 7., 3., 0.2]]) - a = symbol.Variable('a') - dropout = symbol.Dropout(a, p = 0.2) - executor = dropout.simple_bind(a = input_array.shape) - - ## If training - executor.forward(is_train = True, a = input_array) - executor.outputs - [[ 3.75 0.625 -0. 2.5 8.75 ] - [ 2.5 -0.5 8.75 3.75 0. ]] - - ## If testing - executor.forward(is_train = False, a = input_array) - executor.outputs - [[ 3. 0.5 -0.5 2. 7. ] - [ 2. -0.4 7. 3. 0.2 ]] -)" ADD_FILELINE) -.add_argument("data", "NDArray-or-Symbol", "Input array to which dropout will be applied.") -.add_arguments(DropoutParam::__FIELDS__()); - -} // namespace op -} // namespace mxnet diff --git a/src/operator/fully_connected.cc b/src/operator/fully_connected.cc deleted file mode 100644 index 82c32a7d2546..000000000000 --- a/src/operator/fully_connected.cc +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*!
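
The Dropout docstring above describes inverted dropout: during training each element is zeroed with probability p and the survivors are rescaled by 1/(1-p); at test time the input passes through unchanged. A minimal C++ sketch of the training-time transform (illustrative names, not the MXNet kernel)::

  #include <random>
  #include <vector>

  // Zero each element with probability p and scale survivors by 1/(1-p),
  // so the expected sum of the output equals that of the input.
  void dropout_train(std::vector<float>* x, float p, std::mt19937* rng) {
    std::bernoulli_distribution keep(1.0 - p);
    const float scale = 1.0f / (1.0f - p);
    for (float& v : *x) v = keep(*rng) ? v * scale : 0.0f;
  }
  // Test mode is the identity, so no transform is applied.
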
- * \file fully_connected.cc - * \brief fully connect operator -*/ -#include "./fully_connected-inl.h" -#if MXNET_USE_NNPACK == 1 -#include "./nnpack/nnpack_fully_connected-inl.h" -#endif // MXNET_USE_NNPACK - -namespace mxnet { -namespace op { -template<> -Operator* CreateOp(FullyConnectedParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx) { - Operator *op = NULL; -#if MXNET_USE_NNPACK == 1 - const size_t batch_size = (*in_shape)[0][0]; - // nnp_fully_connected_inference will do optimization for batch-size = 1 - // nnp_fully_connected_output will do optimization for batch-size > 1 - switch (dtype) { - case mshadow::kFloat32: - return new NNPACKFullyConnectedOp(param); - default: - break; - } -#endif - switch (dtype) { - case mshadow::kFloat32: - op = new FullyConnectedOp(param); - break; - case mshadow::kFloat64: - op = new FullyConnectedOp(param); - break; - case mshadow::kFloat16: - LOG(FATAL) << "float16 fully connected layer is currently" - "only supported by CuDNN version."; - break; - default: - LOG(FATAL) << "Unsupported type " << dtype; - } - - return op; -} - -// DO_BIND_DISPATCH comes from operator_common.h -Operator *FullyConnectedProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - std::vector out_shape(1, TShape()), aux_shape; - CHECK(InferShape(in_shape, &out_shape, &aux_shape)); - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx); -} - -DMLC_REGISTER_PARAMETER(FullyConnectedParam); - -MXNET_REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedProp) -.describe(R"code(Applies a linear transformation: :math:`Y = XW^T + b`. - -If ``flatten`` is set to be true, then the shapes are: - -- **data**: `(batch_size, x1, x2, ..., xn)` -- **weight**: `(num_hidden, x1 * x2 * ... * xn)` -- **bias**: `(num_hidden,)` -- **out**: `(batch_size, num_hidden)` - -If ``flatten`` is set to be false, then the shapes are: - -- **data**: `(x1, x2, ..., xn, input_dim)` -- **weight**: `(num_hidden, input_dim)` -- **bias**: `(num_hidden,)` -- **out**: `(x1, x2, ..., xn, num_hidden)` - -The learnable parameters include both ``weight`` and ``bias``. - -If ``no_bias`` is set to be true, then the ``bias`` term is ignored. - -)code" ADD_FILELINE) -.add_argument("data", "NDArray-or-Symbol", "Input data.") -.add_argument("weight", "NDArray-or-Symbol", "Weight matrix.") -.add_argument("bias", "NDArray-or-Symbol", "Bias parameter.") -.add_arguments(FullyConnectedParam::__FIELDS__()); -} // namespace op -} // namespace mxnet diff --git a/src/operator/fully_connected.cu b/src/operator/fully_connected.cu deleted file mode 100644 index 28a0307b70bd..000000000000 --- a/src/operator/fully_connected.cu +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
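
The FullyConnected docstring above defines Y = XW^T + b with the data flattened to (batch_size, x1*...*xn). A naive reference implementation of that product, handy for verifying the shapes (illustrative, no BLAS)::

  #include <vector>

  // Y[i][j] = b[j] + sum_k X[i][k] * W[j][k]; W has shape (num_hidden,
  // input_dim), so it is used transposed, exactly as in Y = X W^T + b.
  std::vector<std::vector<float>> fully_connected(
      const std::vector<std::vector<float>>& X,  // (batch, input_dim)
      const std::vector<std::vector<float>>& W,  // (num_hidden, input_dim)
      const std::vector<float>& b) {             // (num_hidden)
    std::vector<std::vector<float>> Y(X.size(), std::vector<float>(W.size()));
    for (size_t i = 0; i < X.size(); ++i)
      for (size_t j = 0; j < W.size(); ++j) {
        float acc = b[j];
        for (size_t k = 0; k < W[j].size(); ++k) acc += X[i][k] * W[j][k];
        Y[i][j] = acc;
      }
    return Y;
  }
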
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file fully_connected.cu - * \brief fully connect operator -*/ -#include "./fully_connected-inl.h" -namespace mxnet { -namespace op { -template<> -Operator* CreateOp(FullyConnectedParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx) { - Operator *op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new FullyConnectedOp(param); - }) - return op; -} -} // namespace op -} // namespace mxnet diff --git a/src/operator/mkl/mkl_batch_norm-inl.h b/src/operator/mkl/mkl_batch_norm-inl.h deleted file mode 100644 index b5967f4de294..000000000000 --- a/src/operator/mkl/mkl_batch_norm-inl.h +++ /dev/null @@ -1,391 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_batch_norm-inl.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_ -#include -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../mshadow_op.h" -#include "./mkl_util-inl.h" - -namespace mxnet { -namespace op { - -template -class MKLBatchNormOp : public Operator { - public: - explicit MKLBatchNormOp(BatchNormParam param) { - this->param_ = param; - fwd_top_data = MKLData::create(); - fwd_bottom_data = MKLData::create(); - bwd_top_diff = MKLData::create(); - bwd_bottom_diff = MKLData::create(); - scaleShift_space.dptr = NULL; - scaleShiftDiff_space.dptr = NULL; - } - virtual ~MKLBatchNormOp() { - if (batchNormFwdInference != NULL) dnnDelete(batchNormFwdInference); - if (batchNormFwdTraining != NULL) dnnDelete(batchNormFwdTraining); - if (batchNormBwdScaleShift != NULL) dnnDelete(batchNormBwdScaleShift); - dnnLayoutDelete(layout_usr_); - if (scaleShift_space.dptr) - Storage::Get()->Free(scaleShift_space); - if (scaleShiftDiff_space.dptr) - Storage::Get()->Free(scaleShiftDiff_space); - } - static std::string getName() { - return "MKLBatchNormOp"; - } - - private: - void LayerSetUp(const mshadow::Tensor &data, - const mshadow::Tensor &out) { - eps_ = param_.eps; - size_t dim = 4, sizes[4], strides[4]; - channels_ = data.shape_[1]; - height_ = data.shape_[2]; - width_ = data.shape_[3]; - num_ = data.shape_[0]; - - sizes[0] = width_; - sizes[1] = height_; - sizes[2] = channels_; - sizes[3] = num_; - - strides[0] = 1; - strides[1] = sizes[0]; - strides[2] = sizes[0] * sizes[1]; - strides[3] = sizes[0] * sizes[1] * sizes[2]; - - // Names are for debugging only - fwd_bottom_data->name = "fwd_bottom_data @ " + getName(); - fwd_top_data->name = "fwd_top_data @ " + getName(); - bwd_bottom_diff->name = "bwd_bottom_diff @ " + getName(); - bwd_top_diff->name = "bwd_top_diff @ " + getName(); - - 
dnnError_t e; - e = dnnLayoutCreate(&layout_usr_, dim, sizes, strides); - CHECK_EQ(e, E_SUCCESS); - - fwd_bottom_data->create_user_layout(dim, sizes, strides); - fwd_top_data->create_user_layout(dim, sizes, strides); - bwd_bottom_diff->create_user_layout(dim, sizes, strides); - bwd_top_diff->create_user_layout(dim, sizes, strides); - - // Primitives will be allocated during the first fwd pass - batchNormFwdInference = NULL; - batchNormFwdTraining = NULL; - batchNormBwdScaleShift = NULL; - int scaleShift_size = channels_*2*sizeof(DType); - scaleShift_space = Storage::Get()->Alloc(scaleShift_size, Context::CPU()); - scaleShiftDiff_space = Storage::Get()->Alloc(scaleShift_size, Context::CPU()); - DType * scaleShift_buf = reinterpret_cast(scaleShift_space.dptr); - /*!use_weight_bias_*/ - for (int i = 0; i < channels_; i++) { - scaleShift_buf[i] = 1.0; - scaleShift_buf[channels_ + i] = 0; - } - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 3); - CHECK_EQ(aux_states.size(), 2); - if (ctx.is_train) { - CHECK_EQ(out_data.size(), 3); - CHECK_EQ(req.size(), 3); - } else { - CHECK_GE(out_data.size(), 1); - CHECK_GE(req.size(), 1); - CHECK_EQ(req[batchnorm::kOut], kWriteTo); - } - - Stream *s = ctx.get_stream(); - Tensor data; - Tensor out; - if (in_data[batchnorm::kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_data[batchnorm::kData].shape_[0], - in_data[batchnorm::kData].shape_[1], 1, 1); - data = mkl_experimental_direct_get_with_shape( - in_data[batchnorm::kData], dshape, s); - out = mkl_experimental_direct_get_with_shape( - out_data[batchnorm::kOut], dshape, s); - } else { - data = mkl_experimental_direct_get(in_data[batchnorm::kData], s); - out = mkl_experimental_direct_get(out_data[batchnorm::kOut], s); - } - - // const real_t scale = static_cast(in_data[batchnorm::kData].shape_[1]) / - // static_cast(in_data[batchnorm::kData].shape_.Size()); - - Tensor slope = in_data[batchnorm::kGamma].get(s); - Tensor bias = in_data[batchnorm::kBeta].get(s); - Tensor moving_mean = aux_states[batchnorm::kMovingMean].get(s); - Tensor moving_var = aux_states[batchnorm::kMovingVar].get(s); - - if (param_.fix_gamma) - slope = 1.f; - - dnnError_t e; - if (!init_mkldnn_) { - LayerSetUp(data, out); - init_mkldnn_ = true; - } - void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = - reinterpret_cast(mkl_prv_data(in_data[batchnorm::kData])); -#endif - int bwd_flags = dnnUseScaleShift; - if (param_.use_global_stats) - bwd_flags = dnnUseScaleShift | dnnUseInputMeanVariance; -#if MKL_EXPERIMENTAL == 1 - if (NULL != bottom_data) { - // Is it the first pass? Create a primitive. 
- if (batchNormFwdInference == NULL) { - std::shared_ptr bottom_data_mem = in_data[batchnorm::kData].Mkl_mem_; - std::shared_ptr bottom_prv_desc = bottom_data_mem->get_prv_descriptor(); - CHECK(bottom_prv_desc->get_descr_type() == PrvMemDescr::PRV_DESCR_MKL2017); - std::shared_ptr > mem_descr - = std::static_pointer_cast>(bottom_prv_desc); - CHECK(mem_descr != NULL); - fwd_bottom_data = mem_descr; - - e = dnnBatchNormalizationCreateForward_v2( - &batchNormFwdInference, NULL, mem_descr->layout_int, eps_, - dnnUseInputMeanVariance | dnnUseScaleShift); - CHECK_EQ(e, E_SUCCESS); - - e = dnnBatchNormalizationCreateForward_v2( - &batchNormFwdTraining, NULL, mem_descr->layout_int, eps_, - dnnUseScaleShift); - CHECK_EQ(e, E_SUCCESS); - - fwd_top_data->create_internal_layout(batchNormFwdInference, dnnResourceDst); - bwd_top_diff->create_internal_layout(batchNormFwdInference, dnnResourceDst); - bwd_bottom_diff->create_internal_layout(batchNormFwdInference, dnnResourceSrc); - - e = dnnBatchNormalizationCreateBackward_v2( - &batchNormBwdScaleShift, NULL, mem_descr->layout_int, eps_, bwd_flags); - CHECK_EQ(e, E_SUCCESS); - } - } -#endif - if (NULL == bottom_data) { - if (batchNormFwdInference == NULL) { - e = dnnBatchNormalizationCreateForward_v2( - &batchNormFwdInference, NULL, layout_usr_, eps_, - dnnUseInputMeanVariance | dnnUseScaleShift); - CHECK_EQ(e, E_SUCCESS); - - e = dnnBatchNormalizationCreateForward_v2( - &batchNormFwdTraining, NULL, layout_usr_, eps_, dnnUseScaleShift); - CHECK_EQ(e, E_SUCCESS); - - e = dnnBatchNormalizationCreateBackward_v2( - &batchNormBwdScaleShift, NULL, layout_usr_, eps_, bwd_flags); - CHECK_EQ(e, E_SUCCESS); - } - bottom_data = reinterpret_cast(data.dptr_); - } - - DType * scaleShift_buf = reinterpret_cast(scaleShift_space.dptr); - // use_weight_bias_ - for (int i = 0; i < channels_; i++) { - scaleShift_buf[i] = (slope.dptr_)[i]; - } - for (int i = 0; i < channels_; i++) { - scaleShift_buf[channels_ + i] = (bias.dptr_)[i]; - } - - void* BatchNorm_res[dnnResourceNumber]; - BatchNorm_res[dnnResourceSrc] = bottom_data; - BatchNorm_res[dnnResourceScaleShift] = scaleShift_space.dptr; - - BatchNorm_res[dnnResourceDst] = fwd_top_data->get_output_ptr(out.dptr_, - fwd_top_data, out_data[batchnorm::kOut]); - if (ctx.is_train && !param_.use_global_stats) { - Tensor mean = out_data[batchnorm::kMean].get(s); - Tensor var = out_data[batchnorm::kVar].get(s); - CHECK(req[batchnorm::kMean] == kNullOp || req[batchnorm::kMean] == kWriteTo); - CHECK(req[batchnorm::kVar] == kNullOp || req[batchnorm::kVar] == kWriteTo); - BatchNorm_res[dnnResourceMean] = mean.dptr_; - BatchNorm_res[dnnResourceVariance] = var.dptr_; - e = dnnExecute(batchNormFwdTraining, BatchNorm_res); - CHECK_EQ(e, E_SUCCESS); - } else { - BatchNorm_res[dnnResourceMean] = moving_mean.dptr_; - BatchNorm_res[dnnResourceVariance] = moving_var.dptr_; - e = dnnExecute(batchNormFwdInference, BatchNorm_res); - CHECK_EQ(e, E_SUCCESS); - } - -#if MKL_EXPERIMENTAL == 0 - if (fwd_top_data->conversion_needed()) { - fwd_top_data->convert_from_prv(out.dptr_); - } -#endif - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1); - CHECK_EQ(in_data.size(), 3); - CHECK_EQ(out_data.size(), 3); - CHECK_EQ(in_grad.size(), 3); - Stream *s = ctx.get_stream(); - Tensor data, 
grad, grad_in; - - if (in_data[batchnorm::kData].ndim() == 2) { - Shape<4> dshape = Shape4(out_grad[batchnorm::kOut].shape_[0], - out_grad[batchnorm::kOut].shape_[1], 1, 1); - data = mkl_experimental_direct_get_with_shape( - in_data[batchnorm::kData], dshape, s); - grad = mkl_experimental_direct_get_with_shape( - out_grad[batchnorm::kOut], dshape, s); - grad_in = mkl_experimental_direct_get_with_shape( - in_grad[batchnorm::kData], dshape, s); - } else { - data = mkl_experimental_direct_get(in_data[batchnorm::kData], s); - grad = mkl_experimental_direct_get(out_grad[batchnorm::kOut], s); - grad_in = mkl_experimental_direct_get(in_grad[batchnorm::kData], s); - } - - Tensor slope = in_data[batchnorm::kGamma].get(s); - Tensor gslope = in_grad[batchnorm::kGamma].get(s); - Tensor gbias = in_grad[batchnorm::kBeta].get(s); - Tensor mean = out_data[batchnorm::kMean].get(s); - Tensor var = out_data[batchnorm::kVar].get(s); - Tensor moving_mean = aux_states[batchnorm::kMovingMean].get(s); - Tensor moving_var = aux_states[batchnorm::kMovingVar].get(s); - - if (param_.fix_gamma) slope = 1.f; - - void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = reinterpret_cast(mkl_prv_data(in_data[batchnorm::kData])); -#endif - if (NULL == bottom_data) - bottom_data = reinterpret_cast(data.dptr_); - - dnnError_t e; - void* BatchNorm_res[dnnResourceNumber]; - BatchNorm_res[dnnResourceSrc] = bottom_data; - BatchNorm_res[dnnResourceScaleShift] = scaleShift_space.dptr; - if (ctx.is_train && !param_.use_global_stats) { - int size = mean.size(0); // Tensor - float * moving_mean_ptr = reinterpret_cast(moving_mean.dptr_); - float * mean_ptr = reinterpret_cast(mean.dptr_); - float * moving_var_ptr = reinterpret_cast(moving_var.dptr_); - float * var_ptr = reinterpret_cast(var.dptr_); - float minus_mom = (1 - param_.momentum); - for (int i = 0; i < size; i++) { - moving_mean_ptr[i] = moving_mean_ptr[i] * param_.momentum - + mean_ptr[i] * minus_mom; - } - for (int i = 0; i < size; i++) { - moving_var_ptr[i] = moving_var_ptr[i] * param_.momentum - + var_ptr[i] * minus_mom; - } - BatchNorm_res[dnnResourceMean] = mean.dptr_; - BatchNorm_res[dnnResourceVariance] = var.dptr_; - } else { - BatchNorm_res[dnnResourceMean] = moving_mean.dptr_; - BatchNorm_res[dnnResourceVariance] = moving_var.dptr_; - } - - - BatchNorm_res[dnnResourceDiffSrc] = bwd_bottom_diff->get_output_ptr(grad_in.dptr_, - bwd_bottom_diff, in_grad[batchnorm::kData]); - BatchNorm_res[dnnResourceDiffDst] = bwd_top_diff->get_converted_prv(grad.dptr_, - true, out_grad[batchnorm::kOut]); - BatchNorm_res[dnnResourceDiffScaleShift] = scaleShiftDiff_space.dptr; - e = dnnExecute(batchNormBwdScaleShift, BatchNorm_res); - CHECK_EQ(e, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - if (bwd_bottom_diff->conversion_needed()) { - bwd_bottom_diff->convert_from_prv(grad_in.dptr_); - } -#endif - DType * scaleShiftDiff_buf = reinterpret_cast(scaleShiftDiff_space.dptr); - if (!param_.fix_gamma) { - // Store ScaleShift blobs - DType* diff_scale = gslope.dptr_; - for (int i = 0; i < channels_; i++) { - diff_scale[i] = scaleShiftDiff_buf[i]; - } - } else { - int gslope_size = gslope.size(0); - float * gslope_ptr = reinterpret_cast(gslope.dptr_); - for (int i = 0; i < gslope_size; i++) { - *gslope_ptr++ = 0.0f; - } - } - DType* diff_shift = gbias.dptr_; - for (int i = 0; i < channels_; i++) { - diff_shift[i] = scaleShiftDiff_buf[channels_ + i]; - } - } - - private: - BatchNormParam param_; - DType eps_; - bool use_weight_bias_; - - int num_; - int channels_; - int height_; - int 
width_; - bool init_mkldnn_ = false; - std::shared_ptr > fwd_top_data; - std::shared_ptr > fwd_bottom_data; - std::shared_ptr > bwd_top_diff; - std::shared_ptr > bwd_bottom_diff; - dnnPrimitive_t batchNormFwdInference = NULL; - dnnPrimitive_t batchNormFwdTraining = NULL; - dnnPrimitive_t batchNormBwdScaleShift = NULL; - Storage::Handle scaleShift_space; - Storage::Handle scaleShiftDiff_space; - dnnLayout_t layout_usr_ = NULL; -}; // class BatchNormOp -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_ diff --git a/src/operator/mkl/mkl_concat-inl.h b/src/operator/mkl/mkl_concat-inl.h deleted file mode 100644 index 1ed1e81d1303..000000000000 --- a/src/operator/mkl/mkl_concat-inl.h +++ /dev/null @@ -1,314 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_concat-inl.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_ -#include -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../channel_op_common.h" -#include "./mkl_util-inl.h" -namespace mxnet { -namespace op { - - -template -class MKLConcatOp : public Operator { - public: - static std::string getName() { - return "MKLConcatOp"; - } - explicit MKLConcatOp(ConcatParam param) - : size_(param.num_args), dimension_(param.dim), init_mkldnn_(false) { - concatFwd_ = static_cast(NULL); - concatBwd_ = static_cast(NULL); - fwd_top_data_ = MKLData::create(); - bwd_top_diff_ = MKLData::create(); - - num_concats_ = param.num_args; - } - virtual ~MKLConcatOp() { - dnnDelete(concatFwd_); - dnnDelete(concatBwd_); - } - - private: - void LayerSetUp(const std::vector > &data, - const mshadow::Tensor &out, - size_t data_shape_size, size_t *split_channels_) { - size_t dim_src = data_shape_size; - size_t dim_dst = dim_src; - num_concats_ = size_; - channels_ = 0; - - for (size_t i = 1; i < num_concats_; ++i) { - for (size_t j = 1; j < data_shape_size; ++j) { - if (j == dimension_) continue; - CHECK_EQ(data[0].shape_[j], data[i].shape_[j]); - } - } - - for (size_t i = 0; i < num_concats_; ++i) { - CHECK_EQ((int)dim_src, data[i].shape_.kDimension); - - fwd_bottom_data_.push_back(MKLData::create()); - bwd_bottom_diff_.push_back(MKLData::create()); - fwd_bottom_data_[i]->name = "fwd_bottom_data_[i]"; - bwd_bottom_diff_[i]->name = "bwd_bottom_data[i]"; - - size_t *sizes_src = new size_t[dim_src]; - size_t *strides_src = new size_t[dim_src]; - for (size_t d = 0; d < dim_src; ++d) { - sizes_src[d] = data[i].shape_[dim_src - d - 1]; - strides_src[d] = (d == 0) ? 
1 : strides_src[d - 1] * sizes_src[d - 1]; - } - - split_channels_[i] = data[i].shape_[1]; - channels_ += split_channels_[i]; - fwd_bottom_data_[i]->create_user_layout(dim_src, sizes_src, strides_src); - bwd_bottom_diff_[i]->create_user_layout(dim_src, sizes_src, strides_src); - delete[] sizes_src; - delete[] strides_src; - } - size_t *sizes_dst = new size_t[dim_dst]; - size_t *strides_dst = new size_t[dim_dst]; - for (size_t d = 0; d < dim_dst; ++d) { - if (d == 2) - sizes_dst[d] = channels_; - else - sizes_dst[d] = data[0].shape_[dim_dst - 1 - d]; - strides_dst[d] = (d == 0) ? 1 : strides_dst[d - 1] * sizes_dst[d - 1]; - } - bwd_top_diff_->create_user_layout(dim_dst, sizes_dst, strides_dst); - fwd_top_data_->create_user_layout(dim_dst, sizes_dst, strides_dst); - delete[] sizes_dst; - delete[] strides_dst; - concatFwd_ = NULL; - concatBwd_ = NULL; - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(static_cast(in_data.size()), size_); - CHECK_EQ(out_data.size(), 1); - CHECK_LT(dimension_, (size_t)in_data[concat_enum::kData0].ndim()); - Stream *s = ctx.get_stream(); - std::vector > data(size_); - Tensor out; - if (in_data[0].ndim() == 2) { - for (int i = 0; i < size_; ++i) { - Shape<4> dshape = Shape4(in_data[i].shape_[0], - in_data[i].shape_[1], 1, 1); - data[i] = mkl_experimental_direct_get_with_shape( - in_data[i], dshape, s); - } - Shape<4> dshape = Shape4(out_data[concat_enum::kOut].shape_[0], - out_data[concat_enum::kOut].shape_[1], 1, 1); - out = mkl_experimental_direct_get_with_shape( - out_data[concat_enum::kOut], dshape, s); - } else if (in_data[0].ndim() == 3) { - for (int i = 0; i < size_; ++i) { - Shape<4> dshape = Shape4(in_data[i].shape_[0], - in_data[i].shape_[1], in_data[i].shape_[2], 1); - data[i] = mkl_experimental_direct_get_with_shape( - in_data[i], dshape, s); - } - Shape<4> dshape = Shape4(out_data[concat_enum::kOut].shape_[0], - out_data[concat_enum::kOut].shape_[1], - out_data[concat_enum::kOut].shape_[2], 1); - out = mkl_experimental_direct_get_with_shape( - out_data[concat_enum::kOut], dshape, s); - } else { - for (int i = 0; i < size_; ++i) { - data[i] = mkl_experimental_direct_get(in_data[i], s); - } - out = mkl_experimental_direct_get(out_data[concat_enum::kOut], s); - } - size_t *split_channels_ = new size_t[num_concats_]; - if (!init_mkldnn_) { - init_mkldnn_ = true; - LayerSetUp(data, out, 4, split_channels_); - } - - dnnError_t e; - std::vector bottom_data; - bool isFirstPass = (concatFwd_ == NULL); - dnnLayout_t *layouts = NULL; - if (isFirstPass) { - layouts = new dnnLayout_t[num_concats_]; - } - - for (size_t i = 0; i < num_concats_; i++) { - void * bottom_i = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_i = mkl_prv_data(in_data[i]); - if (bottom_i != NULL) { - if (isFirstPass) { - std::shared_ptr > mem_descr = - mkl_get_mem_desc(in_data[i].Mkl_mem_); - fwd_bottom_data_[i] = mem_descr; - layouts[i] = mem_descr->layout_int; - } - } -#endif - if (bottom_i == NULL) { - bottom_i = data[i].dptr_; - if (isFirstPass) { - layouts[i] = fwd_bottom_data_[i]->layout_usr; - } - } - - bottom_data.push_back(reinterpret_cast(bottom_i)); - } - - if (isFirstPass) { - e = dnnConcatCreate(&concatFwd_, NULL, num_concats_, layouts); - CHECK_EQ(e, E_SUCCESS); - - fwd_top_data_->create_internal_layout(concatFwd_, dnnResourceDst); - bwd_top_diff_->create_internal_layout(concatFwd_, 
dnnResourceDst); - - e = dnnSplitCreate(&concatBwd_, NULL, num_concats_, - bwd_top_diff_->layout_int, split_channels_); - CHECK_EQ(e, E_SUCCESS); - - for (size_t n = 0; n < num_concats_; ++n) { - fwd_bottom_data_[n]->create_internal_layout(concatFwd_, - (dnnResourceType_t)(dnnResourceMultipleSrc + n)); - bwd_bottom_diff_[n]->create_internal_layout(concatBwd_, - (dnnResourceType_t)(dnnResourceMultipleDst + n)); - } - } - delete[] layouts; - - void *concat_res[dnnResourceNumber]; - for (size_t i = 0; i < num_concats_; ++i) { - concat_res[dnnResourceMultipleSrc + i] - = reinterpret_cast(bottom_data[i]); - } - - concat_res[dnnResourceDst] = fwd_top_data_->get_output_ptr(out.dptr_, - fwd_top_data_, out_data[concat_enum::kOut]); - e = dnnExecute(concatFwd_, concat_res); - CHECK_EQ(e, E_SUCCESS); - delete[] split_channels_; - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1); - CHECK_EQ(in_grad.size(), static_cast(size_)); - Stream *s = ctx.get_stream(); - std::vector > grad_in(size_); - Tensor grad; - if (in_grad[0].ndim() == 2) { - Shape<4> dshape = Shape4(out_grad[concat_enum::kOut].shape_[0], - out_grad[concat_enum::kOut].shape_[1], 1, 1); - grad = mkl_experimental_direct_get_with_shape( - out_grad[concat_enum::kOut], dshape, s); - for (int i = 0; i < size_; ++i) { - dshape = Shape4(in_grad[i].shape_[0], - in_grad[i].shape_[1], 1, 1); - grad_in[i] = mkl_experimental_direct_get_with_shape( - in_grad[i], dshape, s); - } - } else if (in_grad[0].ndim() == 3) { - Shape<4> dshape = Shape4(out_grad[concat_enum::kOut].shape_[0], - out_grad[concat_enum::kOut].shape_[1], - out_grad[concat_enum::kOut].shape_[2], 1); - grad = mkl_experimental_direct_get_with_shape( - out_grad[concat_enum::kOut], dshape, s); - for (int i = 0; i < size_; ++i) { - dshape = Shape4(in_grad[i].shape_[0], - in_grad[i].shape_[1], in_grad[i].shape_[2], 1); - grad_in[i] = mkl_experimental_direct_get_with_shape( - in_grad[i], dshape, s); - } - } else { - grad = mkl_experimental_direct_get(out_grad[concat_enum::kOut], s); - for (int i = 0; i < size_; ++i) { - grad_in[i] = mkl_experimental_direct_get(in_grad[i], s); - } - } - - int need_bwd = 0; - for (size_t n = 0; n < num_concats_; n++) { - need_bwd += req[n]; - } - if (!need_bwd) { - return; - } - - dnnError_t e; - void *concat_res[dnnResourceNumber]; - concat_res[dnnResourceSrc] = bwd_top_diff_->get_converted_prv(grad.dptr_, true, - out_grad[concat_enum::kOut]); - for (size_t i = 0; i < num_concats_; ++i) { - concat_res[dnnResourceMultipleDst + i] = bwd_bottom_diff_[i]->get_output_ptr( - grad_in[i].dptr_, bwd_bottom_diff_[i], in_grad[i]); - } - e = dnnExecute(concatBwd_, concat_res); - CHECK_EQ(e, E_SUCCESS); - } - - private: - int size_; - size_t dimension_; - - bool init_mkldnn_; - - dnnPrimitive_t concatFwd_; - dnnPrimitive_t concatBwd_; - std::shared_ptr > fwd_top_data_; - std::vector< std::shared_ptr > > fwd_bottom_data_; - std::shared_ptr > bwd_top_diff_; - std::vector< std::shared_ptr > > bwd_bottom_diff_; - - - size_t width_; - size_t height_; - size_t channels_; - size_t num_; - size_t num_concats_; -}; // class MKLConcatOp -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_ diff --git a/src/operator/mkl/mkl_convolution-inl.h 
b/src/operator/mkl/mkl_convolution-inl.h deleted file mode 100644 index 870e568a96f3..000000000000 --- a/src/operator/mkl/mkl_convolution-inl.h +++ /dev/null @@ -1,490 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_convolution-inl.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../convolution-inl.h" -#include "./mkl_util-inl.h" - -namespace mxnet { -namespace op { - -template -class MKLConvolutionOp : public Operator { - public: - static std::string getName() { - return "MKLConvolutionOp"; - } - void SetupBuffer() { - convolutionBwdBias = static_cast(NULL); - convolutionBwdFilter = static_cast(NULL); - convolutionBwdData = static_cast(NULL); - convolutionFwd = static_cast(NULL); - fwd_bottom_data = MKLData::create(); - fwd_top_data = MKLData::create(); - fwd_filter_data = MKLData::create(); - fwd_bias_data = MKLData::create(); - bwdd_top_diff = MKLData::create(); - bwdd_bottom_diff = MKLData::create(); - bwdd_filter_data = MKLData::create(); - bwdf_top_diff = MKLData::create(); - bwdf_filter_diff = MKLData::create(); - bwdf_bottom_data = MKLData::create(); - bwdb_top_diff = MKLData::create(); - bwdb_bias_diff = MKLData::create(); - // Names are for debugging purposes only. - fwd_bottom_data->name = "fwd_bottom_data @ " + this->getName(); - fwd_top_data->name = "fwd_top_data @ " + this->getName(); - fwd_filter_data->name = "fwd_filter_data @ " + this->getName(); - fwd_bias_data->name = "fwd_bias_data @ " + this->getName(); - bwdd_top_diff->name = "bwdd_top_diff @ " + this->getName(); - bwdd_bottom_diff->name = "bwdd_bottom_diff @ " + this->getName(); - bwdd_filter_data->name = "bwdd_filter_data @ " + this->getName(); - bwdf_top_diff->name = "bwdf_top_diff @ " + this->getName(); - bwdf_bottom_data->name = "bwdf_bottom_data @ " + this->getName(); - bwdf_filter_diff->name = "bwdf_filter_diff @ " + this->getName(); - bwdb_top_diff->name = "bwdb_top_diff @ " + this->getName(); - bwdb_bias_diff->name = "bwdb_bias_diff @ " + this->getName(); - } - - explicit MKLConvolutionOp(ConvolutionParam p): - convolutionFwd(NULL), - convolutionBwdData(static_cast(NULL)), - convolutionBwdFilter(static_cast(NULL)), - convolutionBwdBias(static_cast(NULL)) { - this->param_ = p; - init_mkldnn_ = false; - // convert MBytes first to Bytes and then to elements. 
- param_.workspace = (param_.workspace << 20) / sizeof(DType); - SetupBuffer(); - } - void ReleaseBuffer() { - if (convolutionFwd != NULL) { - dnnDelete(convolutionFwd); - convolutionFwd = NULL; - } - if (convolutionBwdData != NULL) { - dnnDelete(convolutionBwdData); - convolutionBwdData = NULL; - } - if (convolutionBwdFilter != NULL) { - dnnDelete(convolutionBwdFilter); - convolutionBwdFilter = NULL; - } - if (!param_.no_bias && convolutionBwdBias != NULL) { - dnnDelete(convolutionBwdBias); - convolutionBwdBias = NULL; - } - } - virtual ~MKLConvolutionOp() { - ReleaseBuffer(); - } - - private: - void LayerSetUp(const mshadow::Tensor &data, - const mshadow::Tensor &out) { - this->width_ = data.shape_[3]; - this->height_ = data.shape_[2]; - this->channels_ = data.shape_[1]; - this->num_ = data.shape_[0]; - this->group_ = param_.num_group; - this->width_out_ = out.shape_[3]; - this->height_out_ = out.shape_[2]; - int channel_out_ = out.shape_[1]; - this->num_output_ = channel_out_; - kernel_w_ = param_.kernel[1]; - kernel_h_ = param_.kernel[0]; - stride_w_ = param_.stride[1]; - stride_h_ = param_.stride[0]; - pad_w_ = param_.pad[1]; - pad_h_ = param_.pad[0]; - int status; - size_t n, g; - size_t iw, ih, ic; - size_t ow, oh, oc; - size_t kw, kh; - size_t dimension = 4; - g = std::max(this->group_, 1); - n = this->num_; - iw = this->width_; - ih = this->height_; - ic = this->channels_; - ow = this->width_out_; - oh = this->height_out_; - oc = this->num_output_; - kw = this->kernel_w_; - kh = this->kernel_h_; - oc = this->num_output_; - size_t bdata_sizes[4] = { iw, ih, ic, n }; - size_t bdata_strides[4] = { 1, iw, iw*ih, iw*ih*ic }; - /* starting with MKL 2017 Gold in case of groups filter layout - * becomes 5D, i.e. groups become a separate dimension */ - size_t g_mkl2017 = g; - size_t f_dimension = dimension + (g != 1); - if (getMKLBuildDate() < 20160701) { - g_mkl2017 = 1; - f_dimension = dimension; - } - size_t fdata_sizes[5] = { kw, kh, ic / g, oc / g_mkl2017, g_mkl2017 }; - size_t fdata_strides[5] = { 1, kw, kw*kh, kw*kh*ic / g, kw*kh*ic / g*oc / g }; - size_t bias_sizes[1] = { oc }; - size_t bias_strides[1] = { 1 }; - size_t tdata_sizes[4] = { ow, oh, oc, n }; - size_t tdata_strides[4] = { 1, ow, ow*oh, ow*oh*oc }; - size_t convolutionStrides[2] = { this->stride_w_, this->stride_h_ }; - int inputOffset[2] = { -this->pad_w_, -this->pad_h_ }; - // Names are for debugging purposes only. 
- /*** convolution section ***/ - if (!param_.no_bias) { - status = dnnGroupsConvolutionCreateForwardBias(&convolutionFwd, - NULL, - dnnAlgorithmConvolutionDirect, - g, - dimension, - bdata_sizes, - tdata_sizes, - fdata_sizes, - convolutionStrides, - inputOffset, - dnnBorderZeros); - } else { - status = dnnGroupsConvolutionCreateForward(&convolutionFwd, - NULL, - dnnAlgorithmConvolutionDirect, - g, - dimension, - bdata_sizes, - tdata_sizes, - fdata_sizes, - convolutionStrides, - inputOffset, - dnnBorderZeros); - } - CHECK_EQ(status, 0) - << "Failed dnnCreateConvolution(dnnForward) with status " - << status << "\n"; - fwd_bottom_data->create_layouts(convolutionFwd, dnnResourceSrc, dimension, - bdata_sizes, bdata_strides); - fwd_top_data->create_layouts(convolutionFwd, dnnResourceDst, dimension, - tdata_sizes, tdata_strides); - fwd_filter_data->create_layouts(convolutionFwd, dnnResourceFilter, - f_dimension, fdata_sizes, fdata_strides); - if (!param_.no_bias) - fwd_bias_data->create_layouts(convolutionFwd, dnnResourceBias, 1, - bias_sizes, bias_strides); - /* - * Backward by data layer setup - */ - status = dnnGroupsConvolutionCreateBackwardData(&convolutionBwdData, - NULL, - dnnAlgorithmConvolutionDirect, - g, - dimension, - bdata_sizes, - tdata_sizes, - fdata_sizes, - convolutionStrides, - inputOffset, - dnnBorderZeros); - CHECK_EQ(status, 0) - << "Failed dnnConvolutionCreateBackwardData with status " - << status << "\n"; - bwdd_bottom_diff->create_layouts(convolutionBwdData, dnnResourceDiffSrc, - dimension, bdata_sizes, bdata_strides); - bwdd_top_diff->create_layouts(convolutionBwdData, dnnResourceDiffDst, - dimension, tdata_sizes, tdata_strides); - bwdd_filter_data->create_layouts(convolutionBwdData, dnnResourceFilter, - f_dimension, fdata_sizes, fdata_strides); - /* - * Backward by filter layer setup - */ - status = dnnGroupsConvolutionCreateBackwardFilter(&convolutionBwdFilter, - NULL, - dnnAlgorithmConvolutionDirect, - g, - dimension, - bdata_sizes, - tdata_sizes, - fdata_sizes, - convolutionStrides, - inputOffset, - dnnBorderZeros); - CHECK_EQ(status, 0) - << "Failed dnnConvolutionCreateBackwardFilter with status " - << status << "\n"; - bwdf_bottom_data->create_layouts(convolutionBwdFilter, dnnResourceSrc, - dimension, bdata_sizes, bdata_strides); - bwdf_top_diff->create_layouts(convolutionBwdFilter, dnnResourceDiffDst, - dimension, tdata_sizes, tdata_strides); - bwdf_filter_diff->create_layouts(convolutionBwdFilter, dnnResourceDiffFilter, - f_dimension, fdata_sizes, fdata_strides); - /* - * Backward by bias layer setup - */ - if (!param_.no_bias) { - status = dnnGroupsConvolutionCreateBackwardBias(&convolutionBwdBias, - NULL, - dnnAlgorithmConvolutionDirect, - g, - dimension, - tdata_sizes); - CHECK_EQ(status, 0) - << "Failed dnnConvolutionCreateBackwardBias with status " - << status << "\n"; - bwdb_top_diff->create_layouts(convolutionBwdBias, dnnResourceDiffDst, - dimension, tdata_sizes, tdata_strides); - bwdb_bias_diff->create_layouts(convolutionBwdBias, dnnResourceDiffBias, 1, - bias_sizes, bias_strides); - } - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - Stream *s = ctx.get_stream(); - DType *data_ptr = NULL; - DType *wmat_ptr = NULL; - DType *out_ptr = NULL; - Tensor data = - mkl_experimental_direct_get(in_data[conv::kData], s); - Tensor out = - mkl_experimental_direct_get(out_data[conv::kOut], s); - Tensor wmat = - 
mkl_experimental_direct_get<cpu, 4, DType>(in_data[conv::kWeight], s);
-    if (!init_mkldnn_) {
-      LayerSetUp(data, out);
-      init_mkldnn_ = true;
-    }
-    CHECK_EQ(data.CheckContiguous(), true);
-    CHECK_EQ(wmat.CheckContiguous(), true);
-    CHECK_EQ(out.CheckContiguous(), true);
-    data_ptr = data.dptr_;
-    wmat_ptr = wmat.dptr_;
-    out_ptr = out.dptr_;
-    int status;
-    void *res_convolutionFwd[dnnResourceNumber];
-    res_convolutionFwd[dnnResourceSrc] =
-      fwd_bottom_data->get_converted_prv(data_ptr, false, in_data[conv::kData]);
-    res_convolutionFwd[dnnResourceFilter] =
-      fwd_filter_data->get_converted_prv(wmat_ptr, true, in_data[conv::kWeight]);
-    if (!param_.no_bias) {
-      Tensor<cpu, 1, DType> bias =
-        mkl_experimental_direct_get<cpu, 1, DType>(in_data[conv::kBias], s);
-      res_convolutionFwd[dnnResourceBias] =
-        fwd_bias_data->get_converted_prv(bias.dptr_, true, in_data[conv::kBias]);
-    }
-
-    res_convolutionFwd[dnnResourceDst] = fwd_top_data->get_output_ptr(out_ptr,
-      fwd_top_data, out_data[conv::kOut]);
-    status = dnnExecute<DType>(convolutionFwd, res_convolutionFwd);
-    CHECK_EQ(status, 0) << "Forward convolution failed with status " << status;
-#if MKL_EXPERIMENTAL == 0
-    if (fwd_top_data->conversion_needed()) {
-      fwd_top_data->convert_from_prv(out_ptr);
-    }
-#endif
-  }
-  void AddToModeAllocAndStoreBuffer(void *src, int blob_size, Storage::Handle *pws) {
-    int blob_byte_size = blob_size * sizeof(DType);
-    *pws = Storage::Get()->Alloc(blob_byte_size, Context::CPU());
-    memcpy(pws->dptr, src, blob_byte_size);
-  }
-  void AddToModeAddAndReleaseBuffer(Storage::Handle *pws, void *dst_, int blob_size) {
-    DType *dst = reinterpret_cast<DType *>(dst_);
-    DType *src = reinterpret_cast<DType *>(pws->dptr);
-#pragma omp parallel for
-    for (int i = 0; i < blob_size; i++) {
-      dst[i] += src[i];
-    }
-    if (pws->dptr)
-      Storage::Get()->Free(*pws);
-    pws->dptr = NULL;
-  }
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    if (param_.kernel.ndim() > 2) {
-      LOG(FATAL) << "Volume convolution is not implemented in mshadow";
-    }
-    CHECK_EQ(out_grad.size(), 1);
-    size_t expected = param_.no_bias == 0 ?
3 : 2; - CHECK(in_data.size() == expected && in_grad.size() == expected); - CHECK_EQ(req.size(), expected); - CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true); - Stream *s = ctx.get_stream(); - Tensor data = - mkl_experimental_direct_get(in_data[conv::kData], s); - Shape<3> wmat_shape = - Shape3(param_.num_group, - param_.num_filter / param_.num_group, - data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]); - Tensor wmat = - mkl_experimental_direct_get_with_shape( - in_data[conv::kWeight], wmat_shape, s); - Tensor grad = - mkl_experimental_direct_get(out_grad[conv::kOut], s); - Tensor gdata = - mkl_experimental_direct_get(in_grad[conv::kData], s); - Tensor gwmat = - mkl_experimental_direct_get_with_shape( - in_grad[conv::kWeight], wmat_shape, s); - - if (!init_mkldnn_) { - init_mkldnn_ = true; - LayerSetUp(data, grad); - } - int status; - if (req[0]) { - void *res_convolutionBwdData[dnnResourceNumber]; - res_convolutionBwdData[dnnResourceDiffDst] = - bwdd_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]); - - res_convolutionBwdData[dnnResourceFilter] = - bwdd_filter_data->get_converted_prv(wmat.dptr_, false, in_data[conv::kWeight]); - Storage::Handle addtoWorkspace; - if (req[0] == kAddTo) { - // wait mkl support addto mode - AddToModeAllocAndStoreBuffer(gdata.dptr_, in_grad[conv::kData].Size(), &addtoWorkspace); - } - - res_convolutionBwdData[dnnResourceDiffSrc] = bwdd_bottom_diff->get_output_ptr(gdata.dptr_, - bwdd_bottom_diff, in_grad[conv::kData]); - status = dnnExecute(convolutionBwdData, res_convolutionBwdData); - CHECK_EQ(status, 0) << "Backward Data conv failed with status " << status; -#if MKL_EXPERIMENTAL == 0 - if (bwdd_bottom_diff->conversion_needed()) { - bwdd_bottom_diff->convert_from_prv(gdata.dptr_); - } -#endif - if (req[0] == kAddTo) { - if (bwdd_bottom_diff->conversion_needed()) { - bwdd_bottom_diff->convert_from_prv(gdata.dptr_); - } - AddToModeAddAndReleaseBuffer(&addtoWorkspace, gdata.dptr_, in_grad[conv::kData].Size()); - } - } - if (req[1]) { - void *res_convolutionBwdFilter[dnnResourceNumber]; - - res_convolutionBwdFilter[dnnResourceDiffDst] = - bwdf_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]); - - res_convolutionBwdFilter[dnnResourceSrc] = - bwdf_bottom_data->get_converted_prv(data.dptr_, false, - in_data[conv::kData]); - Storage::Handle addtoWorkspace; - if (req[1] == kAddTo) { - // wait mkl support addto mode - AddToModeAllocAndStoreBuffer(gwmat.dptr_, in_grad[conv::kWeight].Size(), &addtoWorkspace); - } - - res_convolutionBwdFilter[dnnResourceDiffFilter] = bwdf_filter_diff->get_output_ptr( - gwmat.dptr_, bwdf_filter_diff, in_grad[conv::kWeight]); - status = dnnExecute(convolutionBwdFilter, res_convolutionBwdFilter); - CHECK_EQ(status, 0) << "Backward Filter conv failed with status " << status; -#if MKL_EXPERIMENTAL == 0 - if (bwdf_filter_diff->conversion_needed()) { - bwdf_filter_diff->convert_from_prv(gwmat.dptr_); - } -#endif - if (req[1] == kAddTo) { - if (bwdf_filter_diff->conversion_needed()) { - bwdf_filter_diff->convert_from_prv(gwmat.dptr_); - } - AddToModeAddAndReleaseBuffer(&addtoWorkspace, gwmat.dptr_, in_grad[conv::kWeight].Size()); - } - } - if (!param_.no_bias) { - Tensor gbias = - mkl_experimental_direct_get(in_grad[conv::kBias], s); - void *res_convolutionBwdBias[dnnResourceNumber]; - res_convolutionBwdBias[dnnResourceDiffDst] = - bwdb_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]); - - res_convolutionBwdBias[dnnResourceDiffBias] = 
bwdb_bias_diff->get_output_ptr(gbias.dptr_, - bwdb_bias_diff, in_grad[conv::kBias]); - status = dnnExecute(convolutionBwdBias, res_convolutionBwdBias); - CHECK_EQ(status, 0) << "Backward Bias failed with status " << status; -#if MKL_EXPERIMENTAL == 0 - if (bwdb_bias_diff->conversion_needed()) { - bwdb_bias_diff->convert_from_prv(gbias.dptr_); - } -#endif - } - } - - private: - ConvolutionParam param_; - size_t width_, - height_, - width_out_, - height_out_, - kernel_w_, - kernel_h_, - stride_w_, - stride_h_; - int group_, - num_, - num_output_; - size_t channels_; - int pad_w_, - pad_h_; - bool init_mkldnn_; - dnnPrimitive_t convolutionFwd; - dnnPrimitive_t convolutionBwdData; - dnnPrimitive_t convolutionBwdFilter; - dnnPrimitive_t convolutionBwdBias; - /* Fwd step */ - std::shared_ptr > fwd_bottom_data, fwd_top_data, fwd_filter_data, - fwd_bias_data; - /* Bwd data step */ - std::shared_ptr > bwdd_top_diff, bwdd_bottom_diff; - std::shared_ptr > bwdd_filter_data; - /* Bwd filter step */ - std::shared_ptr > bwdf_top_diff, bwdf_filter_diff; - std::shared_ptr > bwdf_bottom_data; - std::shared_ptr > bwdf_filter_diff_iter, bwdf2fwd_filter_diff, - bwdb_bias_diff_iter; - /* Bwd bias step */ - std::shared_ptr > bwdb_top_diff, bwdb_bias_diff; -}; // class ConvolutionOp -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_ diff --git a/src/operator/mkl/mkl_cppwrapper.cc b/src/operator/mkl/mkl_cppwrapper.cc deleted file mode 100644 index 507e5498c85b..000000000000 --- a/src/operator/mkl/mkl_cppwrapper.cc +++ /dev/null @@ -1,44 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_cppwrapper.cc -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ - - - -#include "mkl_cppwrapper.h" -#include -#if MXNET_USE_MKL2017 == 1 -#include "mkl_service.h" - -int getMKLBuildDate() { - static int build = 0; - if (build == 0) { - MKLVersion v; - mkl_get_version(&v); - build = atoi(v.Build); - printf("MKL Build:%d\n", build); - } - return build; -} - -bool enableMKLWarnGenerated() { - return false; -} -#endif // MSHADOW_USE_MKL2017 diff --git a/src/operator/mkl/mkl_cppwrapper.h b/src/operator/mkl/mkl_cppwrapper.h deleted file mode 100644 index 7d66f20ad308..000000000000 --- a/src/operator/mkl/mkl_cppwrapper.h +++ /dev/null @@ -1,1020 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_cppwrapper.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_ -#define MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_ - - -#include -#include -#if MXNET_USE_MKL2017 == 1 -#include "mkl_dnn_types.h" -#include "mkl_dnn.h" -#include "mkl_version.h" - - -extern int getMKLBuildDate(); -extern bool enableMKLWarnGenerated(); - - -template inline dnnError_t dnnLayoutCreate( - dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]); -template <> inline dnnError_t dnnLayoutCreate( - dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]) { - return dnnLayoutCreate_F32(pLayout, dimension, size, strides); -} -template <> inline dnnError_t dnnLayoutCreate( - dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]) { - return dnnLayoutCreate_F64(pLayout, dimension, size, strides); -} - -template inline dnnError_t dnnLayoutCreateFromPrimitive( - dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type); -template <> inline dnnError_t dnnLayoutCreateFromPrimitive( - dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type) { - return dnnLayoutCreateFromPrimitive_F32(pLayout, primitive, type); -} -template <> inline dnnError_t dnnLayoutCreateFromPrimitive( - dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type) { - return dnnLayoutCreateFromPrimitive_F64(pLayout, primitive, type); -} - -template inline size_t dnnLayoutGetMemorySize( - const dnnLayout_t layout); -template <> inline size_t dnnLayoutGetMemorySize( - const dnnLayout_t layout) { - return dnnLayoutGetMemorySize_F32(layout); -} -template <> inline size_t dnnLayoutGetMemorySize( - const dnnLayout_t layout) { - return dnnLayoutGetMemorySize_F64(layout); -} - -template inline int dnnLayoutCompare( - const dnnLayout_t l1, const dnnLayout_t l2); -template <> inline int dnnLayoutCompare( - const dnnLayout_t l1, const dnnLayout_t l2) { - return dnnLayoutCompare_F32(l1, l2); -} -template <> inline int dnnLayoutCompare( - const dnnLayout_t l1, const dnnLayout_t l2) { - return dnnLayoutCompare_F64(l1, l2); -} - - -template inline dnnError_t dnnAllocateBuffer( - void **pPtr, dnnLayout_t layout); -template <> inline dnnError_t dnnAllocateBuffer( - void **pPtr, dnnLayout_t layout) { - return dnnAllocateBuffer_F32(pPtr, layout); -} -template <> inline dnnError_t dnnAllocateBuffer( - void **pPtr, dnnLayout_t layout) { - return dnnAllocateBuffer_F64(pPtr, layout); -} - -template inline dnnError_t dnnReleaseBuffer( - void *ptr); -template <> inline dnnError_t dnnReleaseBuffer( - void *ptr) { - return dnnReleaseBuffer_F32(ptr); -} -template <> inline dnnError_t dnnReleaseBuffer( - void *ptr) { - return dnnReleaseBuffer_F64(ptr); -} - -template inline dnnError_t dnnLayoutDelete( - dnnLayout_t layout); -template <> inline dnnError_t dnnLayoutDelete( - dnnLayout_t layout) { - return dnnLayoutDelete_F32(layout); -} -template <> inline dnnError_t 
dnnLayoutDelete( - dnnLayout_t layout) { - return dnnLayoutDelete_F64(layout); -} - -template inline dnnError_t dnnPrimitiveAttributesCreate( - dnnPrimitiveAttributes_t *attributes); -template <> inline dnnError_t dnnPrimitiveAttributesCreate( - dnnPrimitiveAttributes_t *attributes) { - return dnnPrimitiveAttributesCreate_F32(attributes); -} -template <> inline dnnError_t dnnPrimitiveAttributesCreate( - dnnPrimitiveAttributes_t *attributes) { - return dnnPrimitiveAttributesCreate_F64(attributes); -} - - -template inline dnnError_t dnnPrimitiveAttributesDestroy( - dnnPrimitiveAttributes_t attributes); -template <> inline dnnError_t dnnPrimitiveAttributesDestroy( - dnnPrimitiveAttributes_t attributes) { - return dnnPrimitiveAttributesDestroy_F32(attributes); -} -template <> inline dnnError_t dnnPrimitiveAttributesDestroy( - dnnPrimitiveAttributes_t attributes) { - return dnnPrimitiveAttributesDestroy_F64(attributes); -} - -template inline dnnError_t dnnPrimitiveGetAttributes( - dnnPrimitive_t primitive, - dnnPrimitiveAttributes_t *attributes); -template <> inline dnnError_t dnnPrimitiveGetAttributes( - dnnPrimitive_t primitive, - dnnPrimitiveAttributes_t *attributes) { - return dnnPrimitiveGetAttributes_F32(primitive, attributes); -} -template <> inline dnnError_t dnnPrimitiveGetAttributes( - dnnPrimitive_t primitive, - dnnPrimitiveAttributes_t *attributes) { - return dnnPrimitiveGetAttributes_F64(primitive, attributes); -} - -template inline dnnError_t dnnExecute( - dnnPrimitive_t primitive, void *resources[]); -template <> inline dnnError_t dnnExecute( - dnnPrimitive_t primitive, void *resources[]) { - return dnnExecute_F32(primitive, resources); -} -template <> inline dnnError_t dnnExecute( - dnnPrimitive_t primitive, void *resources[]) { - return dnnExecute_F64(primitive, resources); -} - -template inline dnnError_t dnnExecuteAsync( - dnnPrimitive_t primitive, void *resources[]); -template <> inline dnnError_t dnnExecuteAsync( - dnnPrimitive_t primitive, void *resources[]) { - return dnnExecuteAsync_F32(primitive, resources); -} -template <> inline dnnError_t dnnExecuteAsync( - dnnPrimitive_t primitive, void *resources[]) { - return dnnExecuteAsync_F64(primitive, resources); -} - -template inline dnnError_t dnnWaitFor( - dnnPrimitive_t primitive); -template <> inline dnnError_t dnnWaitFor( - dnnPrimitive_t primitive) { - return dnnWaitFor_F32(primitive); -} -template <> inline dnnError_t dnnWaitFor( - dnnPrimitive_t primitive) { - return dnnWaitFor_F64(primitive); -} - -template inline dnnError_t dnnDelete( - dnnPrimitive_t primitive); -template <> inline dnnError_t dnnDelete( - dnnPrimitive_t primitive) { - return dnnDelete_F32(primitive); -} -template <> inline dnnError_t dnnDelete( - dnnPrimitive_t primitive) { - return dnnDelete_F64(primitive); -} - - -template inline dnnError_t dnnConversionCreate( - dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to); -template <> inline dnnError_t dnnConversionCreate( - dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to) { - return dnnConversionCreate_F32(pConversion, from, to); -} -template <> inline dnnError_t dnnConversionCreate( - dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to) { - return dnnConversionCreate_F64(pConversion, from, to); -} - - -template inline dnnError_t dnnConversionExecute( - dnnPrimitive_t conversion, void *from, void *to); -template <> inline dnnError_t dnnConversionExecute( - dnnPrimitive_t conversion, void *from, void *to) { - return 
dnnConversionExecute_F32(conversion, from, to); -} -template <> inline dnnError_t dnnConversionExecute( - dnnPrimitive_t conversion, void *from, void *to) { - return dnnConversionExecute_F64(conversion, from, to); -} - - -template inline dnnError_t dnnConvolutionCreateForward( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnConvolutionCreateForward( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnConvolutionCreateForward_F32( - pConvolution, - attributes, - algorithm, - dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - -template <> inline dnnError_t dnnConvolutionCreateForward( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnConvolutionCreateForward_F64( - pConvolution, - attributes, - algorithm, - dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - - -template inline dnnError_t dnnConvolutionCreateForwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnConvolutionCreateForwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnConvolutionCreateForwardBias_F32( - pConvolution, - attributes, - algorithm, - dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} -template <> inline dnnError_t dnnConvolutionCreateForwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnConvolutionCreateForwardBias_F64( - pConvolution, - attributes, - algorithm, - dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - - -template inline dnnError_t dnnConvolutionCreateBackwardData( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnConvolutionCreateBackwardData( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - 
dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnConvolutionCreateBackwardData_F32( - pConvolution, - attributes, - algorithm, - dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} -template <> inline dnnError_t dnnConvolutionCreateBackwardData( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnConvolutionCreateBackwardData_F64( - pConvolution, - attributes, - algorithm, - dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - -template inline dnnError_t dnnConvolutionCreateBackwardFilter( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnConvolutionCreateBackwardFilter( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnConvolutionCreateBackwardFilter_F32( - pConvolution, - attributes, - algorithm, - dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} -template <> inline dnnError_t dnnConvolutionCreateBackwardFilter( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnConvolutionCreateBackwardFilter_F64( - pConvolution, - attributes, - algorithm, - dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - -template inline dnnError_t dnnConvolutionCreateBackwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t dstSize[]); -template <> inline dnnError_t dnnConvolutionCreateBackwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t dstSize[]) { - return dnnConvolutionCreateBackwardBias_F32( - pConvolution, - attributes, - algorithm, - dimension, dstSize); -} -template <> inline dnnError_t dnnConvolutionCreateBackwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t dstSize[]) { - return dnnConvolutionCreateBackwardBias_F64( - pConvolution, - attributes, - algorithm, - dimension, dstSize); -} - -template inline dnnError_t dnnGroupsConvolutionCreateForward( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t 
convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnGroupsConvolutionCreateForward( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnGroupsConvolutionCreateForward_F32( - pConvolution, - attributes, - algorithm, - groups, dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} -template <> inline dnnError_t dnnGroupsConvolutionCreateForward( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnGroupsConvolutionCreateForward_F64( - pConvolution, - attributes, - algorithm, - groups, dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - -template inline dnnError_t dnnGroupsConvolutionCreateForwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnGroupsConvolutionCreateForwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnGroupsConvolutionCreateForwardBias_F32( - pConvolution, - attributes, - algorithm, - groups, dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} -template <> inline dnnError_t dnnGroupsConvolutionCreateForwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnGroupsConvolutionCreateForwardBias_F64( - pConvolution, - attributes, - algorithm, - groups, dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - -template inline dnnError_t dnnGroupsConvolutionCreateBackwardData( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardData( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnGroupsConvolutionCreateBackwardData_F32( - pConvolution, 
- attributes, - algorithm, - groups, dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} -template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardData( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnGroupsConvolutionCreateBackwardData_F64( - pConvolution, - attributes, - algorithm, - groups, dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - - -template inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnGroupsConvolutionCreateBackwardFilter_F32( - pConvolution, - attributes, - algorithm, - groups, dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} -template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnGroupsConvolutionCreateBackwardFilter_F64( - pConvolution, - attributes, - algorithm, - groups, dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - -template inline dnnError_t dnnGroupsConvolutionCreateBackwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t dstSize[]); -template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t dstSize[]) { - return dnnGroupsConvolutionCreateBackwardBias_F32( - pConvolution, - attributes, - algorithm, - groups, dimension, dstSize); -} -template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t dstSize[]) { - return dnnGroupsConvolutionCreateBackwardBias_F64( - pConvolution, - attributes, - algorithm, - groups, dimension, dstSize); -} - -template inline dnnError_t dnnReLUCreateForward( - dnnPrimitive_t* pRelu, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float negativeSlope); -template <> inline dnnError_t dnnReLUCreateForward( - dnnPrimitive_t* pRelu, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float negativeSlope) { - return 
dnnReLUCreateForward_F32( - pRelu, - attributes, - dataLayout, negativeSlope); -} -template <> inline dnnError_t dnnReLUCreateForward( - dnnPrimitive_t* pRelu, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float negativeSlope) { - return dnnReLUCreateForward_F64( - pRelu, - attributes, - dataLayout, negativeSlope); -} - -template inline dnnError_t dnnReLUCreateBackward( - dnnPrimitive_t* pRelu, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope); -template <> inline dnnError_t dnnReLUCreateBackward( - dnnPrimitive_t* pRelu, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope) { - return dnnReLUCreateBackward_F32( - pRelu, - attributes, - diffLayout, dataLayout, negativeSlope); -} -template <> inline dnnError_t dnnReLUCreateBackward( - dnnPrimitive_t* pRelu, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope) { - return dnnReLUCreateBackward_F64( - pRelu, - attributes, - diffLayout, dataLayout, negativeSlope); -} - -template inline dnnError_t dnnLRNCreateForward( - dnnPrimitive_t* pLrn, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k); -template <> inline dnnError_t dnnLRNCreateForward( - dnnPrimitive_t* pLrn, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k) { - return dnnLRNCreateForward_F32( - pLrn, - attributes, - dataLayout, kernel_size, alpha, beta, k); -} -template <> inline dnnError_t dnnLRNCreateForward( - dnnPrimitive_t* pLrn, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k) { - return dnnLRNCreateForward_F64( - pLrn, - attributes, - dataLayout, kernel_size, alpha, beta, k); -} - - -template inline dnnError_t dnnLRNCreateBackward( - dnnPrimitive_t* pLrn, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, - size_t kernel_size, float alpha, float beta, float k); -template <> inline dnnError_t dnnLRNCreateBackward( - dnnPrimitive_t* pLrn, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, - size_t kernel_size, float alpha, float beta, float k) { - return dnnLRNCreateBackward_F32( - pLrn, - attributes, - diffLayout, dataLayout, kernel_size, alpha, beta, k); -} -template <> inline dnnError_t dnnLRNCreateBackward( - dnnPrimitive_t* pLrn, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, - size_t kernel_size, float alpha, float beta, float k) { - return dnnLRNCreateBackward_F64( - pLrn, - attributes, - diffLayout, dataLayout, kernel_size, alpha, beta, k); -} - - -template inline dnnError_t dnnPoolingCreateForward( - dnnPrimitive_t* pPooling, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t op, - const dnnLayout_t srcLayout, - const size_t kernelSize[], const size_t kernelStride[], - const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnPoolingCreateForward( - dnnPrimitive_t* pPooling, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t op, - const dnnLayout_t srcLayout, - const size_t kernelSize[], const size_t kernelStride[], - const int inputOffset[], const dnnBorder_t border_type) { - return dnnPoolingCreateForward_F32( - pPooling, - 
attributes, - op, - srcLayout, - kernelSize, kernelStride, - inputOffset, border_type); -} -template <> inline dnnError_t dnnPoolingCreateForward( - dnnPrimitive_t* pPooling, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t op, - const dnnLayout_t srcLayout, - const size_t kernelSize[], const size_t kernelStride[], - const int inputOffset[], const dnnBorder_t border_type) { - return dnnPoolingCreateForward_F64( - pPooling, - attributes, - op, - srcLayout, - kernelSize, kernelStride, - inputOffset, border_type); -} - - -template inline dnnError_t dnnPoolingCreateBackward( - dnnPrimitive_t* pPooling, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t op, - const dnnLayout_t srcLayout, - const size_t kernelSize[], const size_t kernelStride[], - const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnPoolingCreateBackward( - dnnPrimitive_t* pPooling, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t op, - const dnnLayout_t srcLayout, - const size_t kernelSize[], const size_t kernelStride[], - const int inputOffset[], const dnnBorder_t border_type) { - return dnnPoolingCreateBackward_F32( - pPooling, - attributes, - op, - srcLayout, - kernelSize, kernelStride, - inputOffset, border_type); -} -template <> inline dnnError_t dnnPoolingCreateBackward( - dnnPrimitive_t* pPooling, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t op, - const dnnLayout_t srcLayout, - const size_t kernelSize[], const size_t kernelStride[], - const int inputOffset[], const dnnBorder_t border_type) { - return dnnPoolingCreateBackward_F64( - pPooling, - attributes, - op, - srcLayout, - kernelSize, kernelStride, - inputOffset, border_type); -} - -template inline dnnError_t dnnConcatCreate( - dnnPrimitive_t *pConcat, - dnnPrimitiveAttributes_t attributes, - const size_t N, - dnnLayout_t src[]); -template <> inline dnnError_t dnnConcatCreate( - dnnPrimitive_t *pConcat, - dnnPrimitiveAttributes_t attributes, - const size_t N, - dnnLayout_t src[]) { - return dnnConcatCreate_F32( - pConcat, - attributes, - N, - src); -} -template <> inline dnnError_t dnnConcatCreate( - dnnPrimitive_t *pConcat, - dnnPrimitiveAttributes_t attributes, - const size_t N, - dnnLayout_t src[]) { - return dnnConcatCreate_F64( - pConcat, - attributes, - N, - src); -} - - -template inline dnnError_t dnnSplitCreate( - dnnPrimitive_t *pSplit, - dnnPrimitiveAttributes_t attributes, - const size_t N, - dnnLayout_t src, - size_t dst[]); -template <> inline dnnError_t dnnSplitCreate( - dnnPrimitive_t *pSplit, - dnnPrimitiveAttributes_t attributes, - const size_t N, - dnnLayout_t src, - size_t dst[]) { - return dnnSplitCreate_F32( - pSplit, - attributes, - N, - src, - dst); -} -template <> inline dnnError_t dnnSplitCreate( - dnnPrimitive_t *pSplit, - dnnPrimitiveAttributes_t attributes, - const size_t N, - dnnLayout_t src, - size_t dst[]) { - return dnnSplitCreate_F64( - pSplit, - attributes, - N, - src, - dst); -} - -template inline dnnError_t dnnSumCreate( - dnnPrimitive_t *pSum, - dnnPrimitiveAttributes_t attributes, - const size_t nSummands, dnnLayout_t layout, Dtype *coefficients); -template <> inline dnnError_t dnnSumCreate( - dnnPrimitive_t *pSum, - dnnPrimitiveAttributes_t attributes, - const size_t nSummands, dnnLayout_t layout, float *coefficients) { - return dnnSumCreate_F32( - pSum, - attributes, - nSummands, - layout, coefficients); -} -template <> inline dnnError_t dnnSumCreate( - dnnPrimitive_t *pSum, - dnnPrimitiveAttributes_t attributes, - const size_t nSummands, dnnLayout_t layout, 
double *coefficients) { - return dnnSumCreate_F64( - pSum, - attributes, - nSummands, - layout, coefficients); -} - -template inline dnnError_t dnnBatchNormalizationCreateForward_v2( - dnnPrimitive_t* pBatchNormalization, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float eps, - int flags); - -template <> inline dnnError_t dnnBatchNormalizationCreateForward_v2( - dnnPrimitive_t* pBatchNormalization, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float eps, - int flags) { - return dnnBatchNormalizationCreateForward_v2_F32( - pBatchNormalization, - attributes, - dataLayout, eps, flags); -} -template <> inline dnnError_t dnnBatchNormalizationCreateForward_v2( - dnnPrimitive_t* pBatchNormalization, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float eps, - int flags) { - return dnnBatchNormalizationCreateForward_v2_F64( - pBatchNormalization, - attributes, - dataLayout, eps, flags); -} - - -template inline dnnError_t dnnBatchNormalizationCreateBackward_v2( - dnnPrimitive_t* pBatchNormalization, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float eps, - int flags); - -template <> inline dnnError_t dnnBatchNormalizationCreateBackward_v2( - dnnPrimitive_t* pBatchNormalization, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float eps, - int flags) { - return dnnBatchNormalizationCreateBackward_v2_F32( - pBatchNormalization, - attributes, - dataLayout, eps, flags); -} - -template <> inline dnnError_t dnnBatchNormalizationCreateBackward_v2( - dnnPrimitive_t* pBatchNormalization, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float eps, - int flags) { - return dnnBatchNormalizationCreateBackward_v2_F64( - pBatchNormalization, - attributes, - dataLayout, eps, flags); -} - -template inline dnnError_t dnnInnerProductCreateForward( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels); -template <> inline dnnError_t dnnInnerProductCreateForward( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels) { - return dnnInnerProductCreateForward_F32(pInnerProduct, - attributes, dimensions, - srcSize, outputChannels); -} -template <> inline dnnError_t dnnInnerProductCreateForward( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels) { - return dnnInnerProductCreateForward_F64(pInnerProduct, - attributes, dimensions, - srcSize, outputChannels); -} - -template inline dnnError_t dnnInnerProductCreateForwardBias( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels); - -template <> inline dnnError_t dnnInnerProductCreateForwardBias( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels) { - return dnnInnerProductCreateForwardBias_F32(pInnerProduct, - attributes, dimensions, - srcSize, outputChannels); -} -template <> inline dnnError_t dnnInnerProductCreateForwardBias( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels) { - return dnnInnerProductCreateForwardBias_F64(pInnerProduct, - attributes, dimensions, - srcSize, outputChannels); -} - - 
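
All of mkl_cppwrapper.h repeats the single idiom shown above: declare a generic template, then specialize it for float and double so each call forwards to the matching _F32/_F64 C entry point. A minimal self-contained sketch of the idiom; foo, foo_F32 and foo_F64 are hypothetical stand-ins rather than MKL symbols.

#include <cstdio>

// Stand-ins for a pair of precision-suffixed C entry points.
int foo_F32(float x)  { return std::printf("F32: %f\n", x); }
int foo_F64(double x) { return std::printf("F64: %f\n", x); }

// The generic template is declared but never defined...
template <typename Dtype> inline int foo(Dtype x);
// ...so only the two explicit specializations are usable.
template <> inline int foo<float>(float x)   { return foo_F32(x); }
template <> inline int foo<double>(double x) { return foo_F64(x); }

int main() {
  foo(1.0f);  // dispatches to foo_F32
  foo(2.0);   // dispatches to foo_F64
  // foo(3) would compile here but fail to link: no specialization for int,
  // which is how the wrapper rejects unsupported precisions at build time.
}

The same shape covers every wrapper in this header, from dnnLayoutCreate through the inner-product group that follows.
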
-template inline dnnError_t dnnInnerProductCreateBackwardData( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels); - -template <> inline dnnError_t dnnInnerProductCreateBackwardData( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels) { - return dnnInnerProductCreateBackwardData_F32(pInnerProduct, - attributes, dimensions, - srcSize, outputChannels); -} -template <> inline dnnError_t dnnInnerProductCreateBackwardData( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels) { - return dnnInnerProductCreateBackwardData_F64(pInnerProduct, - attributes, dimensions, - srcSize, outputChannels); -} - - - - -template inline dnnError_t dnnInnerProductCreateBackwardFilter( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels); - -template <> inline dnnError_t dnnInnerProductCreateBackwardFilter( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels) { - return dnnInnerProductCreateBackwardFilter_F32(pInnerProduct, - attributes, dimensions, - srcSize, outputChannels); -} -template <> inline dnnError_t dnnInnerProductCreateBackwardFilter( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels) { - return dnnInnerProductCreateBackwardFilter_F64(pInnerProduct, - attributes, dimensions, - srcSize, outputChannels); -} - - - -template inline dnnError_t dnnInnerProductCreateBackwardBias( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t dstSize[]); - -template <> inline dnnError_t dnnInnerProductCreateBackwardBias( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t dstSize[]) { - return dnnInnerProductCreateBackwardBias_F32(pInnerProduct, - attributes, dimensions, - dstSize); -} -template <> inline dnnError_t dnnInnerProductCreateBackwardBias( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t dstSize[]) { - return dnnInnerProductCreateBackwardBias_F64(pInnerProduct, - attributes, dimensions, - dstSize); -} -#endif // #MXNET_USE_MKL2017 == 1 -#endif // MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_ diff --git a/src/operator/mkl/mkl_elementwise_copy-inl.h b/src/operator/mkl/mkl_elementwise_copy-inl.h deleted file mode 100644 index 48c931291150..000000000000 --- a/src/operator/mkl/mkl_elementwise_copy-inl.h +++ /dev/null @@ -1,69 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*
-* \file mkl_elementwise-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "../operator_common.h"
-#include "../mshadow_op.h"
-#include "./mkl_util-inl.h"
-
-
-namespace mxnet {
-namespace op {
-
-template<typename DType>
-void MKLIdentityCompute(const nnvm::NodeAttrs& attrs,
-                        const OpContext& ctx,
-                        const std::vector<TBlob>& inputs,
-                        const std::vector<OpReqType>& req,
-                        const std::vector<TBlob>& outputs) {
-  if (!req[0]) return;
-#if MKL_EXPERIMENTAL == 1
-  if (op::mkl_prv_data<DType>(inputs[0])) {
-    std::shared_ptr<MKLMemHolder> in_data_mem = inputs[0].Mkl_mem_;
-    // Use a copy of the private descriptor to avoid aliasing problems.
-    std::shared_ptr<MKLData<DType> > top_data = MKLData<DType>::create();
-    std::shared_ptr<MKLMemHolder> top_mem = outputs[0].Mkl_mem_;
-    top_data->copy_from(in_data_mem);
-    top_mem->set_prv_descriptor(top_data);
-    return;
-  }
-#endif
-  int in_blob_size = inputs[0].Size();
-  int out_blob_size = outputs[0].Size();
-  CHECK_EQ(in_blob_size, out_blob_size) << "MKLIdentityCompute: input and output sizes do not match";
-  memcpy(outputs[0].dptr_, inputs[0].dptr_, in_blob_size * sizeof(DType));
-}
-
-
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_
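The elementwise-sum header deleted next used dnnSumCreate with per-input coefficients. A minimal illustrative sketch of that flow over two flattened inputs; the function name is hypothetical, the 1-D layout mirrors how the LayerSetUp below flattens its inputs, and raw pointers stand in for the get_converted_prv plumbing of the real operator.

#include "mkl_dnn.h"
#include <cassert>

// Illustrative sketch only: dst = 1.0*a + 1.0*b over n contiguous floats.
void sum_two_f32(float *a, float *b, float *dst, size_t n) {
  size_t sizes[1] = {n}, strides[1] = {1};
  dnnLayout_t layout = NULL;
  dnnError_t e = dnnLayoutCreate_F32(&layout, 1, sizes, strides);
  assert(e == E_SUCCESS);

  float coeffs[2] = {1.0f, 1.0f};  // plain sum: every coefficient is 1
  dnnPrimitive_t sum = NULL;
  e = dnnSumCreate_F32(&sum, NULL, 2, layout, coeffs);
  assert(e == E_SUCCESS);

  void *res[dnnResourceNumber] = {};
  res[dnnResourceMultipleSrc + 0] = a;
  res[dnnResourceMultipleSrc + 1] = b;
  res[dnnResourceDst] = dst;
  e = dnnExecute_F32(sum, res);
  assert(e == E_SUCCESS);

  dnnDelete_F32(sum);
  dnnLayoutDelete_F32(layout);
}

diff --git a/src/operator/mkl/mkl_elementwise_sum-inl.h b/src/operator/mkl/mkl_elementwise_sum-inl.h
deleted file mode 100644
index d313fd15a5be..000000000000
--- a/src/operator/mkl/mkl_elementwise_sum-inl.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_elementwise-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "../operator_common.h"
-#include "../mshadow_op.h"
-#include "./mkl_util-inl.h"
-
-
-namespace mxnet {
-namespace op {
-template<typename DType>
-static void LayerSetUp(const std::vector<mshadow::Tensor<cpu, 1, DType> > &data,
-                       size_t data_shape_size,
-                       std::shared_ptr<MKLData<DType> > fwd_top_data) {
-  // Whether to use an asymptotically slower (for >2 inputs) but stabler method
-  // of computing the gradient for the PROD operation. (No effect for SUM op.)
-  // stable_prod_grad_ = 1;
-  size_t dim_src = data_shape_size;
-  size_t *sizes_src = new size_t[dim_src];
-  size_t *strides_src = new size_t[dim_src];
-  for (size_t d = 0; d < dim_src; ++d) {
-    sizes_src[d] = data[0].shape_[dim_src - d - 1];
-    strides_src[d] = (d == 0) ?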
1 : strides_src[d - 1] * sizes_src[d - 1]; - } - - fwd_top_data->create_user_layout(dim_src, sizes_src, strides_src); - delete[] sizes_src; - delete[] strides_src; -} - -template -void MKLElementWiseSumCompute_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { - using namespace mshadow; - using namespace mshadow::expr; - if (req[0] == kNullOp) return; - size_t size = in_data.size(); - Stream *s = ctx.get_stream(); - std::vector > data(size); - Tensor out = out_data[0].FlatTo1D(s); - bool in_place_flag = false; - int in_place_idx = 0; - - for (size_t i = 0; i < size; ++i) { - data[i] = in_data[i].FlatTo1D(s); - if (data[i].dptr_ == out.dptr_) { - in_place_idx = i; - in_place_flag = true; - } - } - std::shared_ptr > fwd_top_data = MKLData::create(); - std::vector coeffs_ = std::vector(data.size(), 1); - LayerSetUp(data, 1, fwd_top_data); - - - dnnError_t e; - void *eltwise_res[dnnResourceNumber]; - dnnPrimitive_t sumPrimitive = NULL; - e = dnnSumCreate(&sumPrimitive, NULL, size, fwd_top_data->layout_usr, - &coeffs_[0]); - CHECK_EQ(e, E_SUCCESS); - - eltwise_res[dnnResourceDst] = reinterpret_cast(const_cast(out.dptr_)); - eltwise_res[dnnResourceMultipleSrc] = - reinterpret_cast(reinterpret_cast(in_data[in_place_idx].dptr_)); - for (size_t i = 1; i < size; ++i) { - if (i == in_place_idx) continue; - eltwise_res[dnnResourceMultipleSrc + i] = - reinterpret_cast(reinterpret_cast(in_data[i].dptr_)); - } - - e = dnnExecute(sumPrimitive, eltwise_res); - CHECK_EQ(e, E_SUCCESS); - - if (sumPrimitive != NULL) { - dnnDelete(sumPrimitive); - sumPrimitive = NULL; - } -} - - - -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_ diff --git a/src/operator/mkl/mkl_fully_connected-inl.h b/src/operator/mkl/mkl_fully_connected-inl.h deleted file mode 100644 index 5e296704b6dd..000000000000 --- a/src/operator/mkl/mkl_fully_connected-inl.h +++ /dev/null @@ -1,192 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-* -* \file mkl_fully_connected-inl.h -* \brief -* \author zhenlin.luo@intel.com -* lingyan.guo@intel.com -* -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_ -#include -#include -#include -#include "../activation-inl.h" -#include "./mkl_util-inl.h" - -namespace mxnet { -namespace op { - -template -class MKLFullyConnectedOp : public Operator { - public: - explicit MKLFullyConnectedOp(const FullyConnectedParam& p, - const std::vector& in_shapes, - const std::vector& out_shapes): - param_(p) { - LayerSetUp(in_shapes, out_shapes); - } - - ~MKLFullyConnectedOp() { - dnnDelete(fullyConnectedFwd); - dnnDelete(fullyConnectedBwdData); - dnnDelete(fullyConnectedBwdFilter); - dnnDelete(fullyConnectedBwdBias); - } - static std::string getName() { - return "MKLFullyConnectedOp"; - } - - private: - void LayerSetUp(const std::vector& in_shapes, - const std::vector& out_shapes) { - const TShape& ishape = in_shapes[fullc::kData]; - - const size_t dim = 4; - const size_t src_sizes[4] = {1, 1, ishape.ProdShape(1, ishape.ndim()), ishape[0]}; - const size_t dst_sizes[2] = {param_.num_hidden, ishape[0]}; - const size_t output_channels = param_.num_hidden; - - dnnPrimitiveAttributes_t attributes = NULL; - MKLDNN_CALL(dnnPrimitiveAttributesCreate(&attributes)); - if (!param_.no_bias) { - MKLDNN_CALL(dnnInnerProductCreateForwardBias( - &fullyConnectedFwd, - attributes, - dim, - src_sizes, - output_channels)); - } else { - MKLDNN_CALL(dnnInnerProductCreateForward( - &fullyConnectedFwd, - attributes, - dim, - src_sizes, - output_channels)); - } - MKLDNN_CALL(dnnInnerProductCreateBackwardData( - &fullyConnectedBwdData, - attributes, - dim, - src_sizes, - output_channels)); - MKLDNN_CALL(dnnInnerProductCreateBackwardFilter( - &fullyConnectedBwdFilter, - attributes, - dim, - src_sizes, - output_channels)); - if (!param_.no_bias) { - MKLDNN_CALL(dnnInnerProductCreateBackwardBias( - &fullyConnectedBwdBias, - attributes, - 2, - dst_sizes)); - } - // TODO(minjie): Shouldn't `attributes` be destroyed? - } - - - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - - void* res_fullyConnected[dnnResourceNumber]; - if (req[fullc::kOut] == kNullOp) return; - CHECK_EQ(req[fullc::kOut], kWriteTo); - CHECK_EQ(in_data.size(), param_.no_bias ? 
2 : 3); - CHECK_EQ(out_data.size(), 1); - Stream *s = ctx.get_stream(); - - const TShape& ishape = in_data[fullc::kData].shape_; - const TShape& oshape = out_data[fullc::kOut].shape_; - - Tensor data; - Tensor out; - - Shape4(in_data[fullc::kData].shape_[0], in_data[fullc::kData].shape_[1], 1, 1); - - Shape<4> dshape = Shape4(ishape[0], ishape.ProdShape(1, ishape.ndim()), 1, 1); - Shape<4> odshape = Shape4(oshape[0], oshape.ProdShape(1, oshape.ndim()), 1, 1); - - data = in_data[fullc::kData].get_with_shape(dshape, s); - out = out_data[fullc::kOut].get_with_shape(odshape, s); - res_fullyConnected[dnnResourceSrc] = - reinterpret_cast(in_data[fullc::kData].dptr_); - res_fullyConnected[dnnResourceDst] = - reinterpret_cast(out_data[fullc::kOut].dptr_); - res_fullyConnected[dnnResourceFilter] = - reinterpret_cast(in_data[fullc::kWeight].dptr_); - if (!param_.no_bias) { - res_fullyConnected[dnnResourceBias] = reinterpret_cast(in_data[fullc::kBias].dptr_); - } - - MKLDNN_CALL(dnnExecute(fullyConnectedFwd, res_fullyConnected)); - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - - void* res_fullyConnected[dnnResourceNumber]; - CHECK_EQ(out_grad.size(), 1); - const size_t expected = param_.no_bias ? 2 : 3; - CHECK(in_data.size() == expected && in_grad.size() == expected); - CHECK_EQ(req.size(), expected); - res_fullyConnected[dnnResourceSrc] = - reinterpret_cast(in_data[fullc::kData].dptr_); - res_fullyConnected[dnnResourceFilter] = - reinterpret_cast(in_data[fullc::kWeight].dptr_); - - res_fullyConnected[dnnResourceDiffDst] = - reinterpret_cast(out_grad[fullc::kOut].dptr_); - res_fullyConnected[dnnResourceDiffSrc] = - reinterpret_cast(in_grad[fullc::kData].dptr_); - res_fullyConnected[dnnResourceDiffFilter] = - reinterpret_cast(in_grad[fullc::kWeight].dptr_); - if (!param_.no_bias) { - res_fullyConnected[dnnResourceDiffBias] = - reinterpret_cast(in_grad[fullc::kBias].dptr_); - } - MKLDNN_CALL(dnnExecute(fullyConnectedBwdFilter, res_fullyConnected)); - if (!param_.no_bias) { - MKLDNN_CALL(dnnExecute(fullyConnectedBwdBias, res_fullyConnected)); - } - MKLDNN_CALL(dnnExecute(fullyConnectedBwdData, res_fullyConnected)); - } - - private: - dnnPrimitive_t fullyConnectedFwd{nullptr}; - dnnPrimitive_t fullyConnectedBwdData{nullptr}; - dnnPrimitive_t fullyConnectedBwdFilter{nullptr}; - dnnPrimitive_t fullyConnectedBwdBias{nullptr}; - const FullyConnectedParam param_; -}; // class MKLFullyConnectedOp -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_ diff --git a/src/operator/mkl/mkl_lrn-inl.h b/src/operator/mkl/mkl_lrn-inl.h deleted file mode 100644 index 90dfad50fa62..000000000000 --- a/src/operator/mkl/mkl_lrn-inl.h +++ /dev/null @@ -1,265 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_lrn-inl.h -* \brief -* \author zhenlin.luo@intel.com -* lingyan.guo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_LRN_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_LRN_INL_H_ -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../mshadow_op.h" -#include "./mkl_util-inl.h" - -namespace mxnet { -namespace op { - -template -class MKLLRNOp : public Operator { - public: - static std::string getName() { - return "MKLLRNOp"; - } - - explicit MKLLRNOp(LRNParam param) : - lrnFwd(static_cast(NULL)), - lrnBwd(static_cast(NULL)), - lrn_buffer_(NULL) { - this->param_ = param; - fwd_top_data_ = MKLData::create(); - fwd_bottom_data_ = MKLData::create(); - bwd_top_diff_ = MKLData::create(); - bwd_bottom_diff_ = MKLData::create(); - init_mkldnn_ = false; - } - - virtual ~MKLLRNOp() { - if (lrnFwd != NULL) { - dnnDelete(lrnFwd); - lrnFwd = NULL; - } - if (lrnBwd != NULL) { - dnnDelete(lrnBwd); - lrnBwd = NULL; - } - dnnReleaseBuffer(lrn_buffer_); - } - - private: - void LayerSetup(const mshadow::Tensor &data, - const mshadow::Tensor &out) { - size_ = param_.nsize; - CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local size"; - - alpha_ = param_.alpha; - beta_ = param_.beta; - k_ = param_.knorm; - size_t dim = 4, sizes[4], strides[4]; - channels_ = data.shape_[1]; - height_ = data.shape_[2]; - width_ = data.shape_[3]; - num_ = data.shape_[0]; - sizes[0] = width_; - sizes[1] = height_; - sizes[2] = channels_; - sizes[3] = num_; - - strides[0] = 1; - strides[1] = sizes[0]; - strides[2] = sizes[0] * sizes[1]; - strides[3] = sizes[0] * sizes[1] * sizes[2]; - - fwd_bottom_data_->name = "fwd_bottom_data_ @ " + getName(); - fwd_top_data_->name = "fwd_top_data_ @ " + getName(); - bwd_top_diff_->name = "bwd_top_diff_ @ " + getName(); - bwd_bottom_diff_->name = "bwd_bottom_diff_ @ " + getName(); - - fwd_bottom_data_->create_user_layout(dim, sizes, strides); - fwd_top_data_->create_user_layout(dim, sizes, strides); - bwd_bottom_diff_->create_user_layout(dim, sizes, strides); - bwd_top_diff_->create_user_layout(dim, sizes, strides); - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 2U); - CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size"; - Stream *s = ctx.get_stream(); - Tensor data = mkl_experimental_direct_get( - in_data[lrn_enum::kData], s); - Tensor out = mkl_experimental_direct_get( - out_data[lrn_enum::kOut], s); - if (!init_mkldnn_) { - LayerSetup(data, out); - init_mkldnn_ = true; - } - - const void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = - reinterpret_cast(mkl_prv_data(in_data[lrn_enum::kData])); -#endif -#if MKL_EXPERIMENTAL == 1 - if (NULL != bottom_data) { - if (lrnFwd == NULL) { - std::shared_ptr bottom_data_mem = - in_data[lrn_enum::kData].Mkl_mem_; - std::shared_ptr bottom_prv_descriptor = - bottom_data_mem->get_prv_descriptor(); - CHECK_EQ(bottom_prv_descriptor->get_descr_type(), - PrvMemDescr::PRV_DESCR_MKL2017); - std::shared_ptr > mem_descr - = std::static_pointer_cast>(bottom_prv_descriptor); - CHECK(mem_descr != nullptr); - 
fwd_bottom_data_ = mem_descr; - - dnnError_t e; - dnnLayout_t lrn_buffer_l = NULL; - - e = dnnLRNCreateForward(&lrnFwd, NULL, fwd_bottom_data_->layout_int, - size_, alpha_, beta_, k_); - CHECK_EQ(e, E_SUCCESS); - - fwd_top_data_->create_internal_layout(lrnFwd, dnnResourceDst); - - e = dnnLRNCreateBackward(&lrnBwd, NULL, - fwd_bottom_data_->layout_int, fwd_bottom_data_->layout_int, - size_, alpha_, beta_, k_); - CHECK_EQ(e, E_SUCCESS); - - e = dnnLayoutCreateFromPrimitive( - &lrn_buffer_l, lrnFwd, dnnResourceWorkspace); - CHECK_EQ(e, E_SUCCESS); - e = dnnAllocateBuffer( - reinterpret_cast(&lrn_buffer_), lrn_buffer_l); - CHECK_EQ(e, E_SUCCESS); - dnnLayoutDelete(lrn_buffer_l); - - bwd_top_diff_->create_internal_layout(lrnBwd, dnnResourceDiffDst); - bwd_bottom_diff_->create_internal_layout(lrnBwd, dnnResourceDiffSrc); - } - } -#endif - if (bottom_data == NULL) { - if (lrnFwd == NULL) { - dnnError_t e; - dnnLayout_t lrn_buffer_l = NULL; - e = dnnLRNCreateForward(&lrnFwd, NULL, fwd_bottom_data_->layout_usr, - size_, alpha_, beta_, k_); - CHECK_EQ(e, E_SUCCESS); - - e = dnnLayoutCreateFromPrimitive( - &lrn_buffer_l, lrnFwd, dnnResourceWorkspace); - CHECK_EQ(e, E_SUCCESS); - e = dnnAllocateBuffer( - reinterpret_cast(&lrn_buffer_), lrn_buffer_l); - CHECK_EQ(e, E_SUCCESS); - dnnLayoutDelete(lrn_buffer_l); - - e = dnnLRNCreateBackward(&lrnBwd, NULL, - fwd_bottom_data_->layout_usr, fwd_bottom_data_->layout_usr, - size_, alpha_, beta_, k_); - CHECK_EQ(e, E_SUCCESS); - } - bottom_data = data.dptr_; - } - - dnnError_t e; - void* lrn_res[dnnResourceNumber]; - lrn_res[dnnResourceSrc] = const_cast(bottom_data); - - lrn_res[dnnResourceDst] = fwd_top_data_->get_output_ptr( - out.dptr_, fwd_top_data_, out_data[lrn_enum::kOut]); - lrn_res[dnnResourceWorkspace] = lrn_buffer_; - e = dnnExecute(lrnFwd, lrn_res); - CHECK_EQ(e, E_SUCCESS); - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1); - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 2); - Stream *s = ctx.get_stream(); - Tensor grad = mkl_experimental_direct_get( - out_grad[lrn_enum::kOut], s); - Tensor data = mkl_experimental_direct_get( - in_data[lrn_enum::kData], s); - Tensor grad_in = mkl_experimental_direct_get( - in_grad[lrn_enum::kData], s); - dnnError_t e; - void* lrn_res[dnnResourceNumber]; - lrn_res[dnnResourceDiffDst] = - bwd_top_diff_->get_converted_prv(grad.dptr_, true, out_grad[lrn_enum::kOut]); - lrn_res[dnnResourceWorkspace] = lrn_buffer_; - lrn_res[dnnResourceSrc] = - fwd_bottom_data_->get_converted_prv(data.dptr_, false, in_data[lrn_enum::kData]); - - lrn_res[dnnResourceDiffSrc] = bwd_bottom_diff_->get_output_ptr( - grad_in.dptr_, bwd_bottom_diff_, in_grad[lrn_enum::kData]); - e = dnnExecute(lrnBwd, lrn_res); - CHECK_EQ(e, E_SUCCESS); - } - - private: - LRNParam param_; - int size_; - int pre_pad_; - DType alpha_; - DType beta_; - DType k_; - int num_; - int channels_; - int height_; - int width_; - bool init_mkldnn_; - - private: - dnnPrimitive_t lrnFwd, lrnBwd; - std::shared_ptr > fwd_top_data_; - std::shared_ptr > fwd_bottom_data_; - - std::shared_ptr > bwd_top_diff_; - std::shared_ptr > bwd_bottom_diff_; - - DType *lrn_buffer_; -}; // class LocalResponseNormOp -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_LRN_INL_H_ - diff --git 
a/src/operator/mkl/mkl_memory-inl.h b/src/operator/mkl/mkl_memory-inl.h deleted file mode 100644 index 71af10254b2a..000000000000 --- a/src/operator/mkl/mkl_memory-inl.h +++ /dev/null @@ -1,137 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_memory-inl.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_ - - -#include -#include -#include -#include "mkl_cppwrapper.h" - -namespace mxnet { - -template -struct MKLMemoryDescriptorBase : public PrvMemDescr, - public std::enable_shared_from_this > { - MKLMemoryDescriptorBase() : layout_usr(NULL), layout_int(NULL), - convert_to_int(NULL), convert_from_int(NULL), convert_prv2prv(NULL), - name("UNKNOWN"), internal_ptr(NULL) {} - virtual ~MKLMemoryDescriptorBase() { - dnnLayoutDelete(layout_usr); - dnnLayoutDelete(layout_int); - if (internal_ptr != NULL) { - dnnReleaseBuffer(internal_ptr); - internal_ptr = NULL; - } - if (convert_to_int != NULL) { - dnnDelete(convert_to_int); - convert_to_int = NULL; - } - if (convert_from_int != NULL) { - dnnDelete(convert_from_int); - convert_from_int = NULL; - } - if (convert_prv2prv != NULL) { - dnnDelete(convert_prv2prv); - convert_prv2prv = NULL; - } - } - std::shared_ptr > get_shared_ptr() { - return this->shared_from_this(); - } - - dnnLayout_t layout_usr; - dnnLayout_t layout_int; - dnnPrimitive_t convert_to_int; - dnnPrimitive_t convert_from_int; - dnnPrimitive_t convert_prv2prv; - std::shared_ptr > descr_prv2prv_conversion; - - - std::string name; // for debugging purposes - void allocate() { - if (internal_ptr == NULL) { - int status = dnnAllocateBuffer( - reinterpret_cast(&internal_ptr), layout_int); - CHECK_EQ(status, E_SUCCESS) - << "Failed internal_ptr memory allocation with status " - << status << "\n"; - } - } - virtual void* prv_ptr(bool allocate_when_uninit = true) { - if (internal_ptr == NULL && allocate_when_uninit) - allocate(); - return internal_ptr; - } - inline bool conversion_needed() { - return (convert_to_int != NULL); - } - void create_conversions(); - void create_internal_layout(const dnnPrimitive_t primitive, - dnnResourceType_t type); - void create_user_layout(size_t dimension, const size_t size[], - const size_t strides[]); - void create_layouts( - const dnnPrimitive_t primitive, dnnResourceType_t type, - size_t dimension, const size_t size[], const size_t strides[]); - - virtual PrvDescrType get_descr_type() { - return PRV_DESCR_MKL2017; - } - virtual size_t prv_size() { - return dnnLayoutGetMemorySize(layout_int); - } - virtual size_t prv_count() { - return dnnLayoutGetMemorySize(layout_int) / sizeof(DType); - } - virtual void convert_from_prv(void* cpu_ptr); - virtual void convert_to_prv(void* cpu_ptr); - virtual bool layout_compare(std::shared_ptr 
other); - virtual void convert_from_other(std::shared_ptr other); - protected: - DType* internal_ptr; -}; - -template -struct MKLMemoryDescriptor : MKLMemoryDescriptorBase { - // The last get_converted_prv() argument is a hack for reusing - // in backward a conversion done already in the forward direction. - DType* get_converted_prv(DType *data_ptr, bool set_prv_ptr, - const TBlob &blob); - void* get_output_ptr(DType *data_ptr, std::shared_ptr > self_ptr, - const TBlob &blob, bool in_place = false); - bool copy_from(std::shared_ptr dnn_chunk); - MKLMemoryDescriptor() {} -}; - -template struct MKLData : MKLMemoryDescriptor { - static std::shared_ptr > create() { - return std::make_shared >(); - } -}; - -template struct MKLData; -template struct MKLData; - -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_ diff --git a/src/operator/mkl/mkl_memory.cc b/src/operator/mkl/mkl_memory.cc deleted file mode 100644 index 7682fe1c1f37..000000000000 --- a/src/operator/mkl/mkl_memory.cc +++ /dev/null @@ -1,291 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_memory.cc -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#include "../operator_common.h" - -#if MXNET_USE_MKL2017 == 1 -#include -#include "mkl_memory-inl.h" -#include "mkl_util-inl.h" - -namespace mxnet { - -template -void MKLMemoryDescriptorBase::create_conversions() { - int status; - if (this->convert_from_int) { - status = dnnDelete(this->convert_from_int); - CHECK_EQ(status, E_SUCCESS); - this->convert_from_int = NULL; - } - if (this->convert_to_int) { - status = dnnDelete(this->convert_to_int); - CHECK_EQ(status, E_SUCCESS); - this->convert_to_int = NULL; - } - if (layout_int - && !dnnLayoutCompare(layout_usr, layout_int)) { - CHECK(layout_usr); - status = dnnConversionCreate(&convert_to_int, layout_usr, - layout_int); - CHECK_EQ(status, E_SUCCESS) - << "Failed creation convert_to_int with status " - << status << " for buffer: " << this->name << "\n"; - status = dnnConversionCreate(&convert_from_int, layout_int, - layout_usr); - CHECK_EQ(status, E_SUCCESS) - << "Failed creation convert_from_int with status " - << status << " for buffer: " << this->name << "\n"; - } -} - -template -void MKLMemoryDescriptorBase::create_internal_layout( - const dnnPrimitive_t primitive, dnnResourceType_t type) { - int status; - if (this->layout_int) { - status = dnnLayoutDelete(this->layout_int); - CHECK_EQ(status, E_SUCCESS); - } - status = dnnLayoutCreateFromPrimitive( - &this->layout_int, primitive, type); - CHECK_EQ(status, E_SUCCESS) - << "Failed dnnLayoutCreateFromPrimitive with status " - << status << " for buffer: " << this->name << "\n"; - - if (this->layout_usr) - this->create_conversions(); -} - -template -void MKLMemoryDescriptorBase::create_user_layout( - size_t dimension, const size_t 
size[], const size_t strides[]) { - int status; - if (this->layout_usr) { - status = dnnLayoutDelete(this->layout_usr); - CHECK_EQ(status, E_SUCCESS); - } - - status = dnnLayoutCreate( - &this->layout_usr, dimension, size, strides); - CHECK_EQ(status, E_SUCCESS) << "Failed dnnLayoutCreate with status " - << status << " for buffer: " << this->name << "\n"; - - if (this->layout_int) - this->create_conversions(); -} - -template -void MKLMemoryDescriptorBase::create_layouts( - const dnnPrimitive_t primitive, dnnResourceType_t type, - size_t dimension, const size_t size[], const size_t strides[]) { - this->create_internal_layout(primitive, type); - this->create_user_layout(dimension, size, strides); -} - - -template -void MKLMemoryDescriptorBase::convert_from_prv(void* cpu_ptr) { - CHECK(cpu_ptr); - CHECK(this->convert_from_int); - int status; - void *convert_resources[dnnResourceNumber]; - - convert_resources[dnnResourceFrom] = this->prv_ptr(); - convert_resources[dnnResourceTo] = cpu_ptr; - status = dnnExecute(this->convert_from_int, convert_resources); - CHECK_EQ(status, 0) << "Conversion from prv failed with status " << status; -} - -template -void MKLMemoryDescriptorBase::convert_to_prv(void* cpu_ptr) { - CHECK(cpu_ptr); - CHECK(this->convert_to_int); - int status; - void *convert_resources[dnnResourceNumber]; - - convert_resources[dnnResourceFrom] = cpu_ptr; - convert_resources[dnnResourceTo] = this->prv_ptr(); - status = dnnExecute(this->convert_to_int, convert_resources); - CHECK_EQ(status, 0) << "Conversion from prv failed with status " << status; -} - - -template -bool MKLMemoryDescriptorBase::layout_compare( - std::shared_ptr other) { - CHECK_EQ(other->get_descr_type(), - PrvMemDescr::PRV_DESCR_MKL2017); - std::shared_ptr >other_descr = - std::static_pointer_cast > - (other); - - if (dnnLayoutCompare(other_descr->layout_int, - this->layout_int)) - return true; - else - return false; -} - -template -void MKLMemoryDescriptorBase::convert_from_other( - std::shared_ptr other) { - std::shared_ptr > other_descr = - std::static_pointer_cast > - (other); - - int status; - dnnPrimitive_t convert; - status = dnnConversionCreate(&convert, - other_descr->layout_int, this->layout_int); - - void *convert_resources[dnnResourceNumber]; - convert_resources[dnnResourceFrom] = other_descr->prv_ptr(); - convert_resources[dnnResourceTo] = this->prv_ptr(); - status = dnnExecute(convert, convert_resources); - CHECK_EQ(status, 0) << "Conversion from other failed with status " - << status; - - dnnDelete(convert); -} - - -template -Dtype* MKLMemoryDescriptor::get_converted_prv( - Dtype *cpu_ptr, bool set_prv_ptr, const TBlob &blob) { - Dtype* prv_ptr = NULL; - std::shared_ptr dnn_chunk = NULL; -#if MKL_EXPERIMENTAL == 1 - dnn_chunk = blob.Mkl_mem_; -#endif -#if MKL_EXPERIMENTAL == 1 - if (dnn_chunk != NULL) - prv_ptr = static_cast(dnn_chunk->prv_data()); -#endif - - if (this->convert_to_int != NULL) { -#if MKL_EXPERIMENTAL == 1 - int status; - void *convert_resources[dnnResourceNumber]; -#endif - if (prv_ptr == NULL) { - this->allocate(); - this->convert_to_prv(cpu_ptr); -#if MKL_EXPERIMENTAL == 1 - if (set_prv_ptr) { - dnn_chunk->set_prv_descriptor(this->get_shared_ptr(), true); - } -#endif - return this->internal_ptr; - } -#if MKL_EXPERIMENTAL == 1 - if (prv_ptr != NULL) { - std::shared_ptr > current_descr = - op::mkl_get_mem_desc(dnn_chunk); - if (!dnnLayoutCompare(current_descr->layout_int, - this->layout_int)) { - if (this->convert_prv2prv) { - CHECK_EQ(dnnLayoutCompare( - 
this->descr_prv2prv_conversion->layout_int, - this->layout_int), 0); - status = 0; - } else { - status = dnnConversionCreate(&this->convert_prv2prv, - current_descr->layout_int, this->layout_int); - if (status == 0) - this->descr_prv2prv_conversion = current_descr; - } - if (status != 0) { - this->allocate(); - convert_resources[dnnResourceFrom] = cpu_ptr; - convert_resources[dnnResourceTo] = - reinterpret_cast(this->internal_ptr); - status = dnnExecute(this->convert_to_int, convert_resources); - CHECK_EQ(status, 0) << "Conversion failed with status " << status; - } else { - this->allocate(); - convert_resources[dnnResourceFrom] = reinterpret_cast(prv_ptr); - convert_resources[dnnResourceTo] = - reinterpret_cast(this->internal_ptr); - status = dnnExecute(this->convert_prv2prv, convert_resources); - CHECK_EQ(status, 0) << "Conversion failed with status " << status; - } - if (set_prv_ptr) { - dnn_chunk->set_prv_descriptor(this->get_shared_ptr(), true); - } - return this->internal_ptr; - } else if (current_descr.get() != this) { - // MKL_DLOG(INFO) << "layout OK " - // << current_descr->name << " == " << this->name; - } - } -#endif - return const_cast(prv_ptr); - } else { - if (prv_ptr != NULL) { -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr > other_descr = - std::static_pointer_cast > - (dnn_chunk->prv_descriptor_); - dnn_chunk->check_and_prv_to_cpu(cpu_ptr); -#endif - // printf("get_converted_prv release %s\n", other_descr->name.c_str()); - } - } - return cpu_ptr; -} - -template -void* MKLMemoryDescriptor::get_output_ptr(Dtype *data_ptr, - std::shared_ptr > self_ptr, const TBlob &blob, bool in_place) { -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr dnn_chunk = blob.Mkl_mem_; -#endif - if (this->conversion_needed()) { - void * prv_ptr = this->prv_ptr(); -#if MKL_EXPERIMENTAL == 1 - if (!in_place) { - dnn_chunk->set_prv_descriptor(self_ptr); - } else { - Dtype * blob_prv = op::mkl_prv_data(blob); - if (blob_prv != NULL) - return blob_prv; - } -#endif - return prv_ptr; - } else { -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr > other_descr = - std::static_pointer_cast > - (dnn_chunk->prv_descriptor_); - dnn_chunk->check_and_prv_to_cpu(data_ptr); -#endif - return data_ptr; - } -} - -template class MKLMemoryDescriptor; -template class MKLMemoryDescriptor; - -template class MKLMemoryDescriptorBase; -template class MKLMemoryDescriptorBase; -} // namespace mxnet -#endif diff --git a/src/operator/mkl/mkl_memory.h b/src/operator/mkl/mkl_memory.h deleted file mode 100644 index 13f1fd27b12b..000000000000 --- a/src/operator/mkl/mkl_memory.h +++ /dev/null @@ -1,123 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-* -* \file mkl_memory.cc -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_MEMORY_H_ -#define MXNET_OPERATOR_MKL_MKL_MEMORY_H_ - -#include -#include -#include - - -namespace mxnet { -// Base class -struct PrvMemDescr { - virtual void convert_from_prv(void* cpu_ptr) = 0; - virtual void convert_to_prv(void* cpu_ptr) = 0; - virtual void convert_from_other(std::shared_ptr other) = 0; - virtual void* prv_ptr(bool allocate_when_uninit = true) = 0; - // returns true for matching layouts - virtual bool layout_compare(std::shared_ptr other) = 0; - virtual size_t prv_count() = 0; - virtual size_t prv_size() = 0; - // This might help using prv_ptr_ by different accelerators/engines - enum PrvDescrType { - PRV_DESCR_MKL2017, - PRV_DESCR_MKLDNN - }; - virtual PrvDescrType get_descr_type() = 0; -}; - -#if MKL_EXPERIMENTAL == 1 -// Currently HEAD_AT_PRV do not free CPU data -enum SyncedHead { - HEAD_AT_CPU, - HEAD_AT_PRV, -}; -struct MKLMemHolder { - SyncedHead head_; - std::shared_ptr prv_descriptor_; - bool b_disable_prv_2_cpu; - bool b_eager_mode; - void disable_prv_2_cpu(bool flag) { - b_disable_prv_2_cpu = flag; - } - void set_eager_mode(bool eager_mode) { - b_eager_mode = eager_mode; - } - void set_prv_descriptor(std::shared_ptr descriptor, bool same_data = false) { - head_ = HEAD_AT_PRV; - prv_descriptor_ = descriptor; - } - std::shared_ptr get_prv_descriptor() { - return prv_descriptor_; - } - bool head_at_prv() { - return (head_ == HEAD_AT_PRV) ? true : false; - } - void* prv_data(bool allocate_when_uninit = true) { - if (head_ != HEAD_AT_PRV) { - return NULL; - } - if (prv_descriptor_ == NULL) { - LOG(FATAL) << " prv_descriptor_ is NULL"; - } - CHECK(prv_descriptor_.get()); - return reinterpret_cast(prv_descriptor_->prv_ptr(allocate_when_uninit)); - } - - int prv_count() { - if (head_ != HEAD_AT_PRV) { - return 0; - } - if (prv_descriptor_ == NULL) { - LOG(FATAL) << " prv_descriptor_ is NULL"; - } - CHECK(prv_descriptor_.get()); - return prv_descriptor_->prv_count(); - } - static std::shared_ptr create() { - return std::make_shared(); - } - void check_and_prv_to_cpu(void *dptr_) { - if (!b_disable_prv_2_cpu && head_ == HEAD_AT_PRV) { - CHECK(prv_descriptor_ != nullptr); - prv_descriptor_->convert_from_prv(dptr_); - // Because operator use CPU & maybe change it, change to CPU Flag - head_ = HEAD_AT_CPU; - } - if (b_disable_prv_2_cpu) { - b_disable_prv_2_cpu = false; - } - } - MKLMemHolder() : - head_(HEAD_AT_CPU), prv_descriptor_(nullptr), - b_disable_prv_2_cpu(false), b_eager_mode(false) {} -}; -#else -struct MKLMemHolder { - public: - virtual std::shared_ptr get_prv_descriptor() = 0; -}; -#endif - -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_MEMORY_H_ diff --git a/src/operator/mkl/mkl_pooling-inl.h b/src/operator/mkl/mkl_pooling-inl.h deleted file mode 100644 index bc3fcee317f2..000000000000 --- a/src/operator/mkl/mkl_pooling-inl.h +++ /dev/null @@ -1,357 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_pooling-inl.h -* \brief -* \author zhenlin.luo@intel.com -* lingyan.guo@intel.com -* -*******************************************************************************/ - -#ifndef MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_ -#include -#include -#include -#include "../operator_common.h" -#include "../pooling-inl.h" -#include "./mkl_util-inl.h" - -namespace mxnet { -namespace op { - - -template -class MKLPoolingOp : public Operator { - public: - static std::string getName() { - return "MKLPoolingOp"; - } - explicit MKLPoolingOp(PoolingParam p) { - poolingFwd = static_cast(NULL); - poolingBwd = static_cast(NULL); - max_idx_data = static_cast(NULL); - fwd_top_data = MKLData::create(); - fwd_bottom_data = MKLData::create(); - bwd_top_diff = MKLData::create(); - bwd_bottom_diff = MKLData::create(); - this->param_ = p; - init_mkldnn_ = false; - } - virtual ~MKLPoolingOp() { - if (poolingFwd != NULL) { - dnnDelete(poolingFwd); - poolingFwd = NULL; - } - if (poolingBwd != NULL) { - dnnDelete(poolingBwd); - poolingBwd = NULL; - } - if (max_idx_data != NULL) { - dnnReleaseBuffer(max_idx_data); - max_idx_data = NULL; - } - } - - private: - void LayerSetUp(const mshadow::Tensor &data, - const mshadow::Tensor &out) { - channels_ = data.shape_[1]; - height_ = data.shape_[2]; - width_ = data.shape_[3]; - num_ = data.shape_[0]; - global_pooling_ = param_.global_pool; - if (global_pooling_) { - kernel_h_ = height_; - kernel_w_ = width_; - } else { - kernel_h_ = param_.kernel[0]; - kernel_w_ = param_.kernel[1]; - } - CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; - CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; - pad_h_ = param_.pad[0]; - pad_w_ = param_.pad[1]; - if (global_pooling_) { - stride_h_ = stride_w_ = 1; - } else { - stride_h_ = param_.stride[0]; - stride_w_ = param_.stride[1]; - } - if (global_pooling_) { - CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1) - << "With Global_pooling: true; only pad = 0 and stride = 1"; - } - if (pad_h_ != 0 || pad_w_ != 0) { - CHECK(param_.pool_type == pool_enum::kAvgPooling - || param_.pool_type == pool_enum::kMaxPooling) - << "Padding implemented only for average and max pooling."; - CHECK_LT(pad_h_, kernel_h_); - CHECK_LT(pad_w_, kernel_w_); - } - pooled_height_ = out.shape_[2]; - pooled_width_ = out.shape_[3]; - - size_t dim = 4; - size_t src_sizes[4], src_strides[4]; - size_t dst_sizes[4], dst_strides[4]; - src_sizes[0] = width_; - src_sizes[1] = height_; - src_sizes[2] = channels_; - src_sizes[3] = num_; - src_strides[0] = 1; - src_strides[1] = src_sizes[0]; - src_strides[2] = src_sizes[0] * src_sizes[1]; - src_strides[3] = src_sizes[0] * src_sizes[1] * src_sizes[2]; - dst_sizes[0] = pooled_width_; - dst_sizes[1] = pooled_height_; - dst_sizes[2] = src_sizes[2]; - dst_sizes[3] = src_sizes[3]; - dst_strides[0] = 1; - dst_strides[1] = dst_sizes[0]; - dst_strides[2] = dst_sizes[0] * dst_sizes[1]; - dst_strides[3] = dst_sizes[0] * dst_sizes[1] * dst_sizes[2]; - src_offset[0] = -pad_w_; - src_offset[1] = -pad_h_; - src_offset[2] = -pad_w_; - 
src_offset[3] = -pad_h_; - kernel_stride[0] = stride_w_; - kernel_stride[1] = stride_h_; - kernel_size[0] = kernel_w_; - kernel_size[1] = kernel_h_; - - // Names are for debugging only - fwd_bottom_data->name = "fwd_bottom_data @ " + getName(); - fwd_top_data->name = "fwd_top_data @ " + getName(); - bwd_top_diff->name = "bwd_top_diff @ " + getName(); - bwd_bottom_diff->name = "bwd_bottom_diff @ " + getName(); - - fwd_bottom_data->create_user_layout(dim, src_sizes, src_strides); - fwd_top_data->create_user_layout(dim, dst_sizes, dst_strides); - bwd_bottom_diff->create_user_layout(dim, src_sizes, src_strides); - bwd_top_diff->create_user_layout(dim, dst_sizes, dst_strides); - - // Primitives will be allocated during the first fwd pass - poolingFwd = NULL; - poolingBwd = NULL; - max_idx_data = NULL; - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 1); - Stream *s = ctx.get_stream(); - if (param_.kernel.ndim() >= 3) { - LOG(FATAL) << "Not implmented"; - } - Tensor data = mkl_experimental_direct_get( - in_data[pool_enum::kData], s); - Tensor out = mkl_experimental_direct_get( - out_data[pool_enum::kOut], s); - if (!init_mkldnn_) { - LayerSetUp(data, out); - init_mkldnn_ = true; - } - auto first_pass = false; - if (poolingFwd == NULL) first_pass = true; - - dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax; - - switch (param_.pool_type) { - case pool_enum::kMaxPooling: - algorithm = dnnAlgorithmPoolingMax; - break; - case pool_enum::kAvgPooling: - algorithm = dnnAlgorithmPoolingAvgIncludePadding; - - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - - dnnError_t status; - void* pooling_res[dnnResourceNumber]; - - void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = - reinterpret_cast(mkl_prv_data(in_data[pool_enum::kData])); -#endif - dnnBorder_t border_type = dnnBorderZerosAsymm; - switch (param_.pooling_convention) { - case pool_enum::kFull: - border_type = dnnBorderZeros; - break; - case pool_enum::kValid: - border_type = dnnBorderZerosAsymm; - break; - default: - border_type = dnnBorderZerosAsymm; - break; - } - if (NULL == bottom_data) { - bottom_data = data.dptr_; - if (NULL == poolingFwd) { - status = dnnPoolingCreateForward(&poolingFwd, NULL, - algorithm, fwd_bottom_data->layout_usr, - kernel_size, kernel_stride, - src_offset, border_type); - CHECK_EQ(status, E_SUCCESS); - // Now create poolingBwd - status = dnnPoolingCreateBackward(&poolingBwd, NULL, - algorithm, fwd_bottom_data->layout_usr, - kernel_size, kernel_stride, - src_offset, border_type); - CHECK_EQ(status, E_SUCCESS); - } - } -#if MKL_EXPERIMENTAL == 1 - if (NULL != bottom_data) { - if (NULL == poolingFwd) { - std::shared_ptr bottom_data_mem = in_data[pool_enum::kData].Mkl_mem_; - std::shared_ptr bottom_prv_descriptor = - bottom_data_mem->get_prv_descriptor(); - CHECK_EQ(bottom_prv_descriptor->get_descr_type(), - PrvMemDescr::PRV_DESCR_MKL2017); - std::shared_ptr > mem_descr - = std::static_pointer_cast>(bottom_prv_descriptor); - CHECK(mem_descr != nullptr); - fwd_bottom_data = mem_descr; - - status = dnnPoolingCreateForward(&poolingFwd, NULL, - algorithm, fwd_bottom_data->layout_int, - kernel_size, kernel_stride, - src_offset, border_type); - CHECK_EQ(status, E_SUCCESS); - fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst); - - // 
Now create poolingBwd - status = dnnPoolingCreateBackward(&poolingBwd, NULL, - algorithm, fwd_bottom_data->layout_int, - kernel_size, kernel_stride, - src_offset, border_type); - CHECK_EQ(status, E_SUCCESS); - bwd_top_diff->create_internal_layout(poolingFwd, dnnResourceDst); - bwd_bottom_diff->create_internal_layout(poolingFwd, dnnResourceSrc); - } - } -#endif - - if (first_pass) { - dnnLayout_t max_idx_datal = NULL; - status = dnnLayoutCreateFromPrimitive( - &max_idx_datal, poolingFwd, dnnResourceWorkspace); - CHECK_EQ(status, E_SUCCESS); - status = dnnAllocateBuffer(reinterpret_cast(&max_idx_data), max_idx_datal); - CHECK_EQ(status, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - fwd_bottom_data->create_internal_layout(poolingFwd, dnnResourceSrc); - fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst); - bwd_top_diff->create_internal_layout(poolingBwd, dnnResourceDiffDst); - bwd_bottom_diff->create_internal_layout(poolingBwd, dnnResourceDiffSrc); -#endif - dnnLayoutDelete(max_idx_datal); - first_pass = false; - } - pooling_res[dnnResourceSrc] = bottom_data; - pooling_res[dnnResourceWorkspace] = max_idx_data; - - pooling_res[dnnResourceDst] = fwd_top_data->get_output_ptr( - out.dptr_, fwd_top_data, out_data[pool_enum::kOut]); - status = dnnExecute(poolingFwd, pooling_res); - CHECK_EQ(status, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - if (fwd_top_data->conversion_needed()) { - fwd_top_data->convert_from_prv(out.dptr_); - } -#endif - } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - if (!req[0]) { - return; - } - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1); - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 1); - CHECK_EQ(req.size(), 1); - CHECK_EQ(in_grad.size(), 1); - if (param_.kernel.ndim() >= 3) { - LOG(FATAL) << "Not implmented"; - } - Stream *s = ctx.get_stream(); - Tensor grad = mkl_experimental_direct_get( - out_grad[pool_enum::kOut], s); - Tensor input_grad = mkl_experimental_direct_get( - in_grad[pool_enum::kData], s); - dnnError_t e; - void* pooling_res[dnnResourceNumber]; - pooling_res[dnnResourceWorkspace] = reinterpret_cast(max_idx_data); - - pooling_res[dnnResourceDiffDst] = - bwd_top_diff->get_converted_prv(grad.dptr_, true, out_grad[pool_enum::kOut]); - - pooling_res[dnnResourceDiffSrc] = bwd_bottom_diff->get_output_ptr( - input_grad.dptr_, bwd_bottom_diff, in_grad[pool_enum::kData]); - e = dnnExecute(poolingBwd, pooling_res); - CHECK_EQ(e, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - if (bwd_bottom_diff->conversion_needed()) { - bwd_bottom_diff->convert_from_prv(input_grad.dptr_); - } -#endif - } - - private: - PoolingParam param_; - int kernel_h_, kernel_w_; - int stride_h_, stride_w_; - int pad_h_, pad_w_; - int channels_, num_; - int height_, width_; - int pooled_height_, pooled_width_; - bool global_pooling_; - - private: - size_t kernel_size[2], - kernel_stride[4]; - int src_offset[4]; // 2*(dimension-2) - dnnPrimitive_t poolingFwd, poolingBwd; - DType *max_idx_data; - - std::shared_ptr > fwd_top_data; - std::shared_ptr > fwd_bottom_data; - std::shared_ptr > bwd_top_diff; - std::shared_ptr > bwd_bottom_diff; - bool init_mkldnn_; -}; // class MKLPoolingOp -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_ diff --git a/src/operator/mkl/mkl_relu-inl.h b/src/operator/mkl/mkl_relu-inl.h deleted file 
mode 100644 index 8d7ab5e1e2db..000000000000 --- a/src/operator/mkl/mkl_relu-inl.h +++ /dev/null @@ -1,272 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_relu-inl.h -* \brief -* \author zhenlin.luo@intel.com -* lingyan.guo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_RELU_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_RELU_INL_H_ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "./mkl_util-inl.h" - -namespace mxnet { -namespace op { - -template -class MKLReluOp : public Operator { - public: - static std::string getName() { - return "MKLReluOp"; - } - MKLReluOp(): - reluFwd_(NULL), - reluBwd_(NULL) { - init_mkldnn_ = false; - fwd_top_data_ = MKLData::create(); - fwd_bottom_data_ = MKLData::create(); - bwd_top_diff_ = MKLData::create(); - bwd_bottom_diff_ = MKLData::create(); - } - - ~MKLReluOp() { - if (reluFwd_ != NULL) { - dnnDelete(reluFwd_); - reluFwd_ = NULL; - } - if (reluBwd_ != NULL) { - dnnDelete(reluBwd_); - reluBwd_ = NULL; - } - } - - private: - void LayerSetUp(const mshadow::Tensor &data, - const mshadow::Tensor &out) { - size_t dim = 4; - size_t *sizes = new size_t[dim]; - size_t *strides = new size_t[dim]; - for (size_t d = 0; d < dim; ++d) { - (sizes)[d] = data.shape_[dim - 1 - d]; - (strides)[d] = (d == 0) ? 
1 : (strides)[d - 1] * (sizes)[d - 1]; - } - // Names are for debugging only - fwd_bottom_data_->name = "fwd_bottom_data @ " + getName(); - fwd_top_data_->name = "fwd_top_data @ " + getName(); - bwd_bottom_diff_->name = "bwd_bottom_diff @ " + getName(); - bwd_top_diff_->name = "bwd_top_diff @ " + getName(); - fwd_bottom_data_->create_user_layout(dim, (sizes), (strides)); - fwd_top_data_->create_user_layout(dim, (sizes), (strides)); - bwd_bottom_diff_->create_user_layout(dim, (sizes), (strides)); - bwd_top_diff_->create_user_layout(dim, (sizes), (strides)); - delete[] sizes; - delete[] strides; - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 1); - Stream *s = ctx.get_stream(); - Tensor data; - Tensor out; - if (in_data[activation::kData].ndim() == 1) { - Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], 1, 1, 1); - data = mkl_experimental_direct_get_with_shape( - in_data[activation::kData], dshape, s); - out = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - } else if (in_data[activation::kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], - in_data[activation::kData].shape_[1], 1, 1); - data = mkl_experimental_direct_get_with_shape( - in_data[activation::kData], dshape, s); - out = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - } else if (in_data[activation::kData].ndim() == 3) { - Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], - in_data[activation::kData].shape_[1], - in_data[activation::kData].shape_[2], 1); - data = mkl_experimental_direct_get_with_shape( - in_data[activation::kData], dshape, s); - out = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - } else { - data = mkl_experimental_direct_get(in_data[activation::kData], s); - out = mkl_experimental_direct_get(out_data[activation::kOut], s); - } - if (!init_mkldnn_) { - LayerSetUp(data, out); - init_mkldnn_ = true; - } - void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = - reinterpret_cast(mkl_prv_data(in_data[activation::kData])); -#endif -#if MKL_EXPERIMENTAL == 1 - if (bottom_data != NULL) { - if (reluFwd_ == NULL) { - std::shared_ptr > mem_descr = - mkl_get_mem_desc(in_data[activation::kData].Mkl_mem_); - DType negative_slope = 0; - dnnError_t e; - e = dnnReLUCreateForward(&reluFwd_, NULL, mem_descr->layout_int, - negative_slope); - CHECK_EQ(e, E_SUCCESS); - e = dnnReLUCreateBackward(&reluBwd_, NULL, mem_descr->layout_int, - mem_descr->layout_int, negative_slope); - CHECK_EQ(e, E_SUCCESS); - - fwd_bottom_data_ = mem_descr; - fwd_top_data_->create_internal_layout(reluFwd_, dnnResourceDst); - bwd_top_diff_->create_internal_layout(reluFwd_, dnnResourceDst); - bwd_bottom_diff_->create_internal_layout(reluFwd_, dnnResourceSrc); - } - } -#endif - if (bottom_data == NULL) { - bottom_data = data.dptr_; - if (reluFwd_ == NULL) { - dnnError_t e; - DType negative_slope = 0; - e = dnnReLUCreateForward(&reluFwd_, NULL, - fwd_bottom_data_->layout_usr, negative_slope); - CHECK_EQ(e, E_SUCCESS); - e = dnnReLUCreateBackward(&reluBwd_, NULL, - fwd_bottom_data_->layout_usr, fwd_bottom_data_->layout_usr, - negative_slope); - CHECK_EQ(e, E_SUCCESS); - } - } - dnnError_t e; - void* relu_res[dnnResourceNumber]; - 
relu_res[dnnResourceSrc] = bottom_data; - - relu_res[dnnResourceDst] = fwd_top_data_->get_output_ptr( - out.dptr_, fwd_top_data_, out_data[activation::kOut], (data.dptr_ == out.dptr_)); - e = dnnExecute(reluFwd_, relu_res); - CHECK_EQ(e, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - if (fwd_top_data_->conversion_needed()) { - fwd_top_data_->convert_from_prv(out.dptr_); - } -#endif - } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - if (!req[0]) { - return; - } - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1); - CHECK(in_data.size() == 1 && in_grad.size() == 1); - CHECK_EQ(req.size(), 1); - Stream *s = ctx.get_stream(); - Tensor m_out_grad; - Tensor m_out_data; - Tensor m_in_grad; - - if (out_grad[activation::kOut].ndim() == 1) { - Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0], 1, 1, 1); - m_out_grad = mkl_experimental_direct_get_with_shape( - out_grad[activation::kOut], dshape, s); - m_out_data = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - m_in_grad = mkl_experimental_direct_get_with_shape( - in_grad[activation::kData], dshape, s); - } else if (out_grad[activation::kOut].ndim() == 2) { - Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0], - out_grad[activation::kOut].shape_[1], 1, 1); - m_out_grad = mkl_experimental_direct_get_with_shape( - out_grad[activation::kOut], dshape, s); - m_out_data = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - m_in_grad = mkl_experimental_direct_get_with_shape( - in_grad[activation::kData], dshape, s); - } else if (out_grad[activation::kOut].ndim() == 3) { - Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0], - out_grad[activation::kOut].shape_[1], - out_grad[activation::kOut].shape_[2], 1); - m_out_grad = mkl_experimental_direct_get_with_shape( - out_grad[activation::kOut], dshape, s); - m_out_data = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - m_in_grad = mkl_experimental_direct_get_with_shape( - in_grad[activation::kData], dshape, s); - } else { - m_out_grad = mkl_experimental_direct_get(out_grad[activation::kOut], s); - m_out_data = mkl_experimental_direct_get(out_data[activation::kOut], s); - m_in_grad = mkl_experimental_direct_get(in_grad[activation::kData], s); - } - dnnError_t e; - void* relu_res[dnnResourceNumber]; - - void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = reinterpret_cast(mkl_prv_data(out_data[activation::kOut])); -#endif - if (NULL == bottom_data) { - bottom_data = reinterpret_cast(const_cast(m_out_data.dptr_)); - } - relu_res[dnnResourceSrc] = bottom_data; - relu_res[dnnResourceDiffDst] = bwd_top_diff_->get_converted_prv(m_out_grad.dptr_, - true, out_grad[activation::kOut]); - relu_res[dnnResourceDiffSrc] = bwd_bottom_diff_->get_output_ptr( - m_in_grad.dptr_, bwd_bottom_diff_, in_grad[activation::kData]); - e = dnnExecute(reluBwd_, relu_res); - CHECK_EQ(e, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - if (bwd_bottom_diff_->conversion_needed()) { - bwd_bottom_diff_->convert_from_prv(m_in_grad.dptr_); - } -#endif - } - - private: - bool init_mkldnn_; - std::shared_ptr > fwd_top_data_; - std::shared_ptr > fwd_bottom_data_; - std::shared_ptr > bwd_top_diff_; - std::shared_ptr > bwd_bottom_diff_; - dnnPrimitive_t reluFwd_, reluBwd_; -}; // class MKLReluOp -} // 
namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_RELU_INL_H_ diff --git a/src/operator/mkl/mkl_util-inl.h b/src/operator/mkl/mkl_util-inl.h deleted file mode 100644 index 4ad786a2ce93..000000000000 --- a/src/operator/mkl/mkl_util-inl.h +++ /dev/null @@ -1,110 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_util-inl.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_ -#include -#define MKLDNN_CALL(func) \ - { \ - dnnError_t status = (func); \ - CHECK_EQ(status, E_SUCCESS) << "MKL DNN call failed (status: " << status << ")."; \ - } - - -namespace mxnet { -namespace op { - -#if MKL_EXPERIMENTAL == 1 - template - inline DType * mkl_prv_data(const TBlob &b) { - std::shared_ptr bottom_data_mem = b.Mkl_mem_; - bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv(); - if (mem_valid) { - return reinterpret_cast(bottom_data_mem->prv_data()); - } - return NULL; - } - - template - inline int mkl_prv_count(const TBlob &b) { - std::shared_ptr bottom_data_mem = b.Mkl_mem_; - bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv(); - if (mem_valid) { - return bottom_data_mem->prv_count(); - } - return 0; - } -#endif - inline void mkl_set_priv_flag(const TBlob &b) { -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr bottom_data_mem = b.Mkl_mem_; - bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv(); - if (mem_valid) { - bottom_data_mem->disable_prv_2_cpu(true); - } -#endif - } -#if MKL_EXPERIMENTAL == 1 - template - inline std::shared_ptr > mkl_get_mem_desc( - const std::shared_ptr data_mem) { - std::shared_ptr prv_descriptor = - data_mem->get_prv_descriptor(); - CHECK_EQ(prv_descriptor->get_descr_type(), - PrvMemDescr::PRV_DESCR_MKL2017); - std::shared_ptr > mem_descr - = std::static_pointer_cast> - (prv_descriptor); - CHECK(mem_descr != NULL); - return mem_descr; - } -#endif - template - inline mshadow::Tensor mkl_experimental_direct_get( - const TBlob &b, mshadow::Stream *s) { - mkl_set_priv_flag(b); - return b.get(s); - } - template - inline mshadow::Tensor mkl_experimental_direct_get_with_shape( - const TBlob &b, const mshadow::Shape &shape, mshadow::Stream *s) { - mkl_set_priv_flag(b); - return b.get_with_shape(shape, s); - } -} // namespace op -#if MKL_EXPERIMENTAL == 1 -inline void mkl_tblobs_prv_to_cpu(const std::vector &data) { - for (size_t i = 0; i < data.size(); i++) { - std::shared_ptr mem_holder = data[i].Mkl_mem_; - if (mem_holder != nullptr && mem_holder->b_eager_mode) { - mem_holder->check_and_prv_to_cpu(data[i].dptr_); - } - } -} -inline void mkl_set_tblob_eager_mode(const TBlob &data) { - std::shared_ptr mem_holder = data.Mkl_mem_; - if (mem_holder != nullptr) { - 
mem_holder->set_eager_mode(true);
-  }
-}
-#endif
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_
diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h
new file mode 100644
index 000000000000..f32b8d1ffe93
--- /dev/null
+++ b/src/operator/nn/activation-inl.h
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file activation-inl.h
+ * \brief Activation operator
+ * \author Bing Xu, Da Zheng
+*/
+#ifndef MXNET_OPERATOR_NN_ACTIVATION_INL_H_
+#define MXNET_OPERATOR_NN_ACTIVATION_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator_util.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include "../operator_common.h"
+#include "../mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+// Declare enumeration of input order to make code more intuitive.
+// These enums are only visible within this header
+namespace activation {
+enum ActivationOpInputs {kData};
+enum ActivationOpOutputs {kOut};
+enum ActivationOpType {kReLU, kSigmoid, kTanh, kSoftReLU};
+}  // activation
+
+struct ActivationParam : public dmlc::Parameter<ActivationParam> {
+  // use int for enumeration
+  int act_type;
+  DMLC_DECLARE_PARAMETER(ActivationParam) {
+    DMLC_DECLARE_FIELD(act_type)
+    .add_enum("relu", activation::kReLU)
+    .add_enum("sigmoid", activation::kSigmoid)
+    .add_enum("tanh", activation::kTanh)
+    .add_enum("softrelu", activation::kSoftReLU)
+    .describe("Activation function to be applied.");
+  }
+};
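The parameter struct above turns the string attribute act_type into an int-backed enum at operator-parse time. A rough standalone sketch of that string-to-enum mapping (illustrative only; in MXNet the real parsing code is generated by dmlc-core from DMLC_DECLARE_FIELD(...).add_enum(...)):

#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

enum ActType { kReLU, kSigmoid, kTanh, kSoftReLU };

// Resolve an act_type string the way the generated parameter parser would.
int ParseActType(const std::string& s) {
  static const std::map<std::string, int> table = {
      {"relu", kReLU}, {"sigmoid", kSigmoid},
      {"tanh", kTanh}, {"softrelu", kSoftReLU}};
  auto it = table.find(s);
  if (it == table.end()) throw std::invalid_argument("unknown act_type: " + s);
  return it->second;  // stored as a plain int, as in ActivationParam::act_type
}

int main() {
  std::cout << ParseActType("tanh") << std::endl;  // prints 2
  return 0;
}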
+
+/**
+ * \brief This is the implementation of activation operator.
+ * \tparam xpu The device that the op will be executed on.
+ */
+template<typename xpu, typename ForwardOp, typename BackwardOp, typename DType>
+class ActivationOp {
+ public:
+  virtual void Forward(const OpContext &ctx, const TBlob &in_data,
+                       const OpReqType &req, const TBlob &out_data) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 2, DType> data = in_data.FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> out = out_data.FlatTo2D<xpu, DType>(s);
+    Assign(out, req, F<ForwardOp>(data));
+  }
+
+  virtual void Backward(const OpContext &ctx, const TBlob &out_grad,
+                        const TBlob &out_data, const OpReqType &req,
+                        const TBlob &in_grad) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 2, DType> m_out_grad = out_grad.FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> m_out_data = out_data.FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> m_in_grad = in_grad.FlatTo2D<xpu, DType>(s);
+    Assign(m_in_grad, req, F<BackwardOp>(m_out_data) * m_out_grad);
+  }
+};  // class ActivationOp
+
+template<typename xpu, typename ForwardOp, typename BackwardOp, typename DType>
+ActivationOp<xpu, ForwardOp, BackwardOp, DType> &get_activation_op() {
+  static thread_local ActivationOp<xpu, ForwardOp, BackwardOp, DType> op;
+  return op;
+}
+
+template<typename xpu>
+void _ActivationCompute(const ActivationParam &param, const OpContext &ctx,
+                        const TBlob &input, OpReqType req, const TBlob &output) {
+  MSHADOW_REAL_TYPE_SWITCH(input.type_flag_, DType, {
+    switch (param.act_type) {
+      case activation::kReLU:
+        get_activation_op<xpu, mshadow_op::relu, mshadow_op::relu_grad, DType>().Forward(
+            ctx, input, req, output);
+        break;
+      case activation::kSigmoid:
+        get_activation_op<xpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad, DType>().Forward(
+            ctx, input, req, output);
+        break;
+      case activation::kTanh:
+        get_activation_op<xpu, mshadow_op::tanh, mshadow_op::tanh_grad, DType>().Forward(
+            ctx, input, req, output);
+        break;
+      case activation::kSoftReLU:
+        get_activation_op<xpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>().Forward(
+            ctx, input, req, output);
+        break;
+      default:
+        LOG(FATAL) << "unknown activation type";
+    }
+  });
+}
+
+template<typename xpu>
+void _ActivationGradCompute(const ActivationParam &param, const OpContext &ctx,
+                            const TBlob &out_grad, const TBlob &out_data, OpReqType req,
+                            const TBlob &output) {
+  MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, {
+    switch (param.act_type) {
+      case activation::kReLU:
+        get_activation_op<xpu, mshadow_op::relu, mshadow_op::relu_grad, DType>().Backward(
+            ctx, out_grad, out_data, req, output);
+        break;
+      case activation::kSigmoid:
+        get_activation_op<xpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad, DType>().Backward(
+            ctx, out_grad, out_data, req, output);
+        break;
+      case activation::kTanh:
+        get_activation_op<xpu, mshadow_op::tanh, mshadow_op::tanh_grad, DType>().Backward(
+            ctx, out_grad, out_data, req, output);
+        break;
+      case activation::kSoftReLU:
+        get_activation_op<xpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>().Backward(
+            ctx, out_grad, out_data, req, output);
+        break;
+      default:
+        LOG(FATAL) << "unknown activation type";
+    }
+  });
+}
+
+template<typename xpu>
+void ActivationCompute(const nnvm::NodeAttrs& attrs,
+                       const OpContext& ctx,
+                       const std::vector<TBlob>& inputs,
+                       const std::vector<OpReqType>& req,
+                       const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  _ActivationCompute<xpu>(param, ctx, inputs[0], req[0], outputs[0]);
+}
+
+template<typename xpu>
+void ActivationGradCompute(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+#if MXNET_USE_CUDNN == 1
+  CHECK_EQ(inputs.size(), 3U);
+#else
+  CHECK_EQ(inputs.size(), 2U);
+#endif
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  _ActivationGradCompute<xpu>(param, ctx, inputs[0], inputs[1], req[0], outputs[0]);
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_NN_ACTIVATION_INL_H_
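The compute helpers above pair a runtime switch on param.act_type with compile-time functor selection, so every activation gets its own fully inlined elementwise kernel. A minimal self-contained sketch of that dispatch pattern, with invented functor names rather than the real mshadow_op types:

#include <cmath>
#include <cstdio>

// Functor pair in the style of mshadow_op (names invented for illustration).
struct relu    { static float Map(float x) { return x > 0.f ? x : 0.f; } };
struct sigmoid { static float Map(float x) { return 1.f / (1.f + std::exp(-x)); } };

// Compile-time functor selection: the loop body is inlined per instantiation.
template <typename Op>
void Apply(const float* in, float* out, int n) {
  for (int i = 0; i < n; ++i) out[i] = Op::Map(in[i]);
}

enum ActType { kReLU, kSigmoid };

// Runtime act_type switch picks the compile-time instantiation, mirroring
// the switch in _ActivationCompute above.
void Compute(ActType type, const float* in, float* out, int n) {
  switch (type) {
    case kReLU:    Apply<relu>(in, out, n); break;
    case kSigmoid: Apply<sigmoid>(in, out, n); break;
  }
}

int main() {
  float in[3] = {-1.f, 0.f, 2.f}, out[3];
  Compute(kReLU, in, out, 3);
  std::printf("%g %g %g\n", out[0], out[1], out[2]);  // prints: 0 0 2
  return 0;
}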
(ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file activation.cc + * \brief activation op + * \author Bing Xu, Da Zheng +*/ +#include "./activation-inl.h" +#include "../tensor/elemwise_unary_op.h" +#if MXNET_USE_MKLDNN == 1 +#include "./mkldnn/mkldnn_act-inl.h" +#endif // MXNET_USE_MKLDNN + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(ActivationParam); + +// This will determine the order of the inputs for backward computation. +struct ActivationGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + std::vector heads(ograds.begin(), ograds.end()); + heads.emplace_back(nnvm::NodeEntry{n, activation::kOut, 0}); +#if MXNET_USE_CUDNN == 1 + heads.push_back(n->inputs[activation::kData]); +#endif + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + +static void ActivationComputeEx_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ActivationParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); +#if MXNET_USE_MKLDNN == 1 + switch (inputs[0].dtype()) { + case mshadow::kFloat32: + MKLDNNAct_Forward(ctx, param, inputs[0], req[0], outputs[0]); + return; + default: + break; + } +#endif + _ActivationCompute(param, ctx, inputs[0].data(), req[0], + outputs[0].data()); +} + +void ActivationGradComputeEx_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { +#if MXNET_USE_CUDNN == 1 + CHECK_EQ(inputs.size(), 3U); +#else + CHECK_EQ(inputs.size(), 2U); +#endif + const ActivationParam& param = nnvm::get(attrs.parsed); +#if MXNET_USE_MKLDNN == 1 + switch (inputs[0].dtype()) { + case mshadow::kFloat32: + MKLDNNAct_Backward(ctx, param, inputs[0], inputs[1], req[0], + outputs[0]); + return; + default: + break; + } +#endif + _ActivationGradCompute(param, ctx, inputs[0].data(), inputs[1].data(), + req[0], outputs[0].data()); +} + +inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1); + CHECK_EQ(out_attrs->size(), 1); + const ActivationParam& param = nnvm::get(attrs.parsed); +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + (*out_attrs)[0] = kMKLDNNStorage; + return true; + } +#endif + return ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask, + dispatch_mode, in_attrs, out_attrs); +} + +inline static bool backward_ActStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector 
*in_attrs, + std::vector *out_attrs) { +#if MXNET_USE_CUDNN == 1 + CHECK_EQ(in_attrs->size(), 3U); +#else + CHECK_EQ(in_attrs->size(), 2U); +#endif + CHECK_EQ(out_attrs->size(), 1U); + const ActivationParam& param = nnvm::get(attrs.parsed); +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + (*out_attrs)[0] = kMKLDNNStorage; + return true; + } +#endif +#if MXNET_USE_CUDNN == 1 + return ElemwiseStorageType<3, 1, false, false, false>(attrs, dev_mask, + dispatch_mode, in_attrs, out_attrs); +#else + return ElemwiseStorageType<2, 1, false, false, false>(attrs, dev_mask, + dispatch_mode, in_attrs, out_attrs); +#endif +} + +MXNET_OPERATOR_REGISTER_UNARY(Activation) +.describe(R"code(Applies an activation function element-wise to the input. + +The following activation functions are supported: + +- `relu`: Rectified Linear Unit, :math:`y = max(x, 0)` +- `sigmoid`: :math:`y = \frac{1}{1 + exp(-x)}` +- `tanh`: Hyperbolic tangent, :math:`y = \frac{exp(x) - exp(-x)}{exp(x) + exp(-x)}` +- `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))` + +)code" ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_attr("FInferStorageType", ActivationStorageType) +.set_attr("FCompute", ActivationCompute) +.set_attr("FComputeEx", ActivationComputeEx_CPU) +.set_attr("FGradient", ActivationGrad{"_backward_Activation"}) +.add_arguments(ActivationParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_Activation) +.set_num_inputs(3) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr("FInferStorageType", backward_ActStorageType) +.set_attr("FInferShape", ElemwiseShape<3, 1>) +.set_attr("FInferType", ElemwiseType<3, 1>) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}}; +}) +.set_attr_parser(ParamParser) +.set_attr("FCompute", ActivationGradCompute) +.set_attr("FComputeEx", ActivationGradComputeEx_CPU); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/nn/activation.cu b/src/operator/nn/activation.cu new file mode 100644 index 000000000000..f3027b82bae4 --- /dev/null +++ b/src/operator/nn/activation.cu @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
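+ * \note SoftReLU falls back to the generic mshadow path below because cuDNN
+ * has no softrelu mode. A numerically stable scalar softrelu (softplus) can
+ * use the identity log(1 + exp(x)) = max(x, 0) + log1p(exp(-|x|)); this is an
+ * illustrative sketch, not the actual kernel:
+ * \code
+ * #include <algorithm>
+ * #include <cmath>
+ * double softrelu(double x) {
+ *   return std::max(x, 0.0) + std::log1p(std::exp(-std::fabs(x)));
+ * }
+ * \endcode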
+ * \file activation.cu + * \brief + * \author Bing Xu +*/ +#include "./activation-inl.h" +#include "../mshadow_op.h" +#if MXNET_USE_CUDNN == 1 +#include "./cudnn/cudnn_activation-inl.h" +#endif + +namespace mxnet { +namespace op { + +#if MXNET_USE_CUDNN == 1 + +template +static CuDNNActivationOp &get_cudnn_op(const ActivationParam& param) { + static thread_local CuDNNActivationOp cudnn_op; + cudnn_op.Init(param); + return cudnn_op; +} + +template<> +void ActivationCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const ActivationParam& param = nnvm::get(attrs.parsed); + + // SoftReLU not supported by CUDNN yet + if (param.act_type == activation::kSoftReLU) { + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + get_activation_op().Forward(ctx, + inputs[0], req[0], outputs[0]); + }); + } else { + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + get_cudnn_op(param).Forward(ctx, inputs[0], req[0], outputs[0]); + }); + } +} + +template<> +void ActivationGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const ActivationParam& param = nnvm::get(attrs.parsed); + + // SoftReLU not supported by CUDNN yet + if (param.act_type == activation::kSoftReLU) { + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + get_activation_op().Backward( + ctx, inputs[0], inputs[1], req[0], outputs[0]); + }); + } else { + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + get_cudnn_op(param).Backward(ctx, inputs[0], inputs[2], inputs[1], req[0], outputs[0]); + }); + } +} +#endif + +NNVM_REGISTER_OP(Activation) +.set_attr("FCompute", ActivationCompute); + +NNVM_REGISTER_OP(_backward_Activation) +.set_attr("FCompute", ActivationGradCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h similarity index 70% rename from src/operator/batch_norm-inl.h rename to src/operator/nn/batch_norm-inl.h index 461f70272851..8e754ae431ee 100644 --- a/src/operator/batch_norm-inl.h +++ b/src/operator/nn/batch_norm-inl.h @@ -22,8 +22,8 @@ * \brief * \author Bing Xu, Chris Olivier */ -#ifndef MXNET_OPERATOR_BATCH_NORM_INL_H_ -#define MXNET_OPERATOR_BATCH_NORM_INL_H_ +#ifndef MXNET_OPERATOR_NN_BATCH_NORM_INL_H_ +#define MXNET_OPERATOR_NN_BATCH_NORM_INL_H_ #include #include @@ -33,9 +33,9 @@ #include #include #include -#include "./mshadow_op.h" -#include "./operator_common.h" -#include "mxnet_op.h" +#include "../mshadow_op.h" +#include "../operator_common.h" +#include "../mxnet_op.h" #ifdef __GNUG__ #pragma GCC diagnostic push @@ -46,7 +46,8 @@ namespace mxnet { namespace op { namespace batchnorm { -enum BatchNormOpInputs {kData, kGamma, kBeta}; // kGamma: weights, kBeta: biases +enum BatchNormOpInputs {kData, kGamma, kBeta, kInMovingMean, + kInMovingVar}; // kGamma: weights, kBeta: biases enum BatchNormOpOutputs {kOut, kMean, kVar}; // req, out_data enum BatchNormOpAuxiliary {kMovingMean, kMovingVar}; // aux_states @@ -86,9 +87,9 @@ struct BatchNormParam : public dmlc::Parameter { /*! 
\brief Batch normalization operator */ template -class BatchNormOp : public Operator { +class BatchNormOp { public: - explicit BatchNormOp(BatchNormParam param) { + void Init(BatchNormParam param) { this->param_ = param; } @@ -107,7 +108,7 @@ class BatchNormOp : public Operator { * need, epecial case like Batch Norm requires. * \sa OpReqType, OpContext */ - virtual void Forward(const OpContext &ctx, + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data, @@ -157,7 +158,7 @@ class BatchNormOp : public Operator { * \param aux_states Auxiliary states of operator. Normally operator doesn't need * \sa OperatorProperty, OpReqType, OpContext */ - virtual void Backward(const OpContext &ctx, + void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data, @@ -211,151 +212,49 @@ class BatchNormOp : public Operator { BatchNormParam param_; }; // class BatchNormOp -template -Operator *CreateOp(BatchNormParam param, const int dtype, const TShape& shape); - -#if DMLC_USE_CXX11 -class BatchNormProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]"; - const TShape &dshape = in_shape->at(0); - - const size_t channelAxis = static_cast(param_.axis < 0 - ? static_cast(dshape.ndim()) + param_.axis - : param_.axis); - CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param_.axis; - - const int channelCount = dshape[channelAxis]; - - if (dshape.ndim() == 0) { - return false; - } - - in_shape->at(1) = TShape(Shape1(channelCount)); - in_shape->at(2) = TShape(Shape1(channelCount)); - - out_shape->clear(); - out_shape->push_back(dshape); // kOut - out_shape->push_back(Shape1(channelCount)); // kMean - out_shape->push_back(Shape1(channelCount)); // kVar - - aux_shape->clear(); - aux_shape->push_back(Shape1(channelCount)); // kMovingMean - aux_shape->push_back(Shape1(channelCount)); // kMovingVar - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - using namespace mshadow; - CHECK_GE(in_type->size(), 1U); - const int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - // For float16 input type beta, gamma, mean, and average are stored in float32. - // For other input types, these parameters have the same type as input - // NOTE: This requirement is from cuDNN (v. 
4 and 5) - int dtype_param; - MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, { - dtype_param = mshadow::DataType::kFlag; }); - for (index_t i = 1; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype_param; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, ListArguments()[i]); - } - } - for (index_t i = 0; i < aux_type->size(); ++i) { - if ((*aux_type)[i] != -1) { - UNIFORM_TYPE_CHECK((*aux_type)[i], dtype_param, ListArguments()[i]); - } - } - const size_t n_aux = this->ListAuxiliaryStates().size(); - aux_type->clear(); - for (size_t i = 0; i < n_aux; ++i) { - aux_type->push_back(dtype_param); - } - const size_t n_out = this->ListOutputs().size(); - out_type->clear(); - out_type->push_back(dtype); - for (size_t i = 1; i < n_out; ++i) { - out_type->push_back(dtype_param); - } - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new BatchNormProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "BatchNorm"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[batchnorm::kOut], - out_data[batchnorm::kMean], - out_data[batchnorm::kVar], - in_data[batchnorm::kData], - in_data[batchnorm::kGamma] - }; - } - - int NumVisibleOutputs() const override { - if (param_.output_mean_var) { - return 3; - } - return 1; - } - - int NumOutputs() const override { - return 3; - } - - std::vector ListArguments() const override { - return {"data", "gamma", "beta"}; - } - - std::vector ListOutputs() const override { - return {"output", "mean", "var"}; - } - - std::vector ListAuxiliaryStates() const override { - return {"moving_mean", "moving_var"}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } +template +static BatchNormOp &GetBatchNormOp(const BatchNormParam& param) { + static thread_local BatchNormOp op; + op.Init(param); + return op; +} - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +template +void BatchNormCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const BatchNormParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 5U); + std::vector in_data(inputs.begin(), inputs.begin() + 3); + std::vector aux_states(inputs.begin() + 3, inputs.end()); + MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, { + GetBatchNormOp(param).Forward(ctx, in_data, + req, outputs, aux_states); + }); +} - inline const BatchNormParam& getParam() const { - return param_; - } +template +void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 11U); + const BatchNormParam& param = nnvm::get(attrs.parsed); + std::vector out_grad(inputs.begin(), + inputs.begin() + (param.output_mean_var ? 
3U : 1U)); + std::vector in_data(inputs.begin() + 3, inputs.begin() + 6); + std::vector aux_states(inputs.begin() + 6, inputs.begin() + 8); + std::vector out_data(inputs.begin() + 8, inputs.end()); + std::vector in_grad(outputs.begin(), outputs.begin() + 3); + + MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, { + GetBatchNormOp(param).Backward(ctx, out_grad, + in_data, out_data, req, in_grad, aux_states); + }); +} - private: - BatchNormParam param_; -}; // class BatchNormProp +#if DMLC_USE_CXX11 namespace batchnorm { @@ -488,5 +387,5 @@ extern volatile bool disable_mkl; #pragma GCC diagnostic pop #endif -#endif // MXNET_OPERATOR_BATCH_NORM_INL_H_ +#endif // MXNET_OPERATOR_NN_BATCH_NORM_INL_H_ diff --git a/src/operator/batch_norm.cc b/src/operator/nn/batch_norm.cc similarity index 81% rename from src/operator/batch_norm.cc rename to src/operator/nn/batch_norm.cc index 866b7fe619cb..10e30c4be7f2 100644 --- a/src/operator/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -24,12 +24,13 @@ */ #include "batch_norm-inl.h" -#include #if MXNET_USE_MKL2017 == 1 #include #include "./mkl/mkl_memory-inl.h" #include "./mkl/mkl_batch_norm-inl.h" #endif // MXNET_USE_MKL2017 +#include +#include "../elemwise_op_common.h" /*! \brief inverse standard deviation <-> variance */ #define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0/sqrt((__var$) + DType(__eps$))) @@ -313,45 +314,76 @@ void BatchNormOp::DoBackward(mshadow::Stream *, } } -template<> -Operator *CreateOp(BatchNormParam param, const int dtype, const TShape& shape) { - param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis); - Operator *op = nullptr; -#if MXNET_USE_MKL2017 == 1 - if (shape.ndim() == 4 - && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS - && !mxnet::op::batchnorm::disable_mkl) { - switch (dtype) { - case mshadow::kFloat32: - op = new MKLBatchNormOp(param); - break; - case mshadow::kFloat64: - op = new MKLBatchNormOp(param); - break; - default: - // MKL operator doesn't support half_t, so fall through - break; - } - } -#endif - if (!op) { - MSHADOW_REAL_TYPE_SWITCH_EX(dtype, - DType, - AccReal, { - op = new BatchNormOp(param); }); +DMLC_REGISTER_PARAMETER(BatchNormParam); + +static bool BatchNormShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, std::vector *out_shape) { + const BatchNormParam& param = nnvm::get(attrs.parsed); + using namespace mshadow; + CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, MovingMean, MovingVar]"; + const TShape &dshape = in_shape->at(0); + + const size_t channelAxis = static_cast(param.axis < 0 + ? 
static_cast(dshape.ndim()) + param.axis + : param.axis); + CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param.axis; + + const int channelCount = dshape[channelAxis]; + + if (dshape.ndim() == 0) { + return false; } - return op; + + in_shape->at(1) = TShape(Shape1(channelCount)); + in_shape->at(2) = TShape(Shape1(channelCount)); + in_shape->at(3) = TShape(Shape1(channelCount)); // kMovingMean + in_shape->at(4) = TShape(Shape1(channelCount)); // kMovingVar + + out_shape->clear(); + out_shape->push_back(dshape); // kOut + out_shape->push_back(Shape1(channelCount)); // kMean + out_shape->push_back(Shape1(channelCount)); // kVar + + return true; } -// DO_BIND_DISPATCH comes from operator_common.h -Operator *BatchNormProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_shape)[0]); +static inline std::vector ListArguments() { + return {"data", "gamma", "beta"}; } -DMLC_REGISTER_PARAMETER(BatchNormParam); +static inline std::vector ListOutputs() { + return {"output", "mean", "var"}; +} + +static bool BatchNormType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + using namespace mshadow; + CHECK_GE(in_type->size(), 1U); + const int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + // For float16 input type beta, gamma, mean, and average are stored in float32. + // For other input types, these parameters have the same type as input + // NOTE: This requirement is from cuDNN (v. 4 and 5) + int dtype_param; + MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, { + dtype_param = mshadow::DataType::kFlag; }); + for (index_t i = 1; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype_param; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, ListArguments()[i]); + } + } + const size_t n_out = ListOutputs().size(); + out_type->clear(); + out_type->push_back(dtype); + for (size_t i = 1; i < n_out; ++i) { + out_type->push_back(dtype_param); + } + return true; +} -MXNET_REGISTER_OP_PROPERTY(BatchNorm, BatchNormProp) +NNVM_REGISTER_OP(BatchNorm) .describe(R"code(Batch normalization. Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as @@ -397,14 +429,35 @@ Both ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is tr then set ``gamma`` to 1 and its gradient to 0. )code" ADD_FILELINE) +.set_num_inputs(5) +.set_num_outputs(3) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "gamma", "beta", "moving_mean", "moving_var"}; +}) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output", "mean", "var"}; +}) +.set_attr("FNumVisibleOutputs", + [](const NodeAttrs& attrs) { + const BatchNormParam& param = nnvm::get(attrs.parsed); + return param.output_mean_var ? 
3 : 1; +}) +.set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { + return std::vector{3, 4}; +}) +.set_attr("FInferShape", BatchNormShape) +.set_attr("FInferType", BatchNormType) +.set_attr("FCompute", BatchNormCompute) +.set_attr("FGradient", ElemwiseGradUseInOut{"_backward_BatchNorm"}) .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization") .add_argument("gamma", "NDArray-or-Symbol", "gamma array") .add_argument("beta", "NDArray-or-Symbol", "beta array") .add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input") .add_argument("moving_var", "NDArray-or-Symbol", "running variance of input") -.add_arguments(BatchNormParam::__FIELDS__()); - -NNVM_REGISTER_OP(BatchNorm) +.add_arguments(BatchNormParam::__FIELDS__()) .set_attr( "FSetInputVarAttrOnCompose", [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { @@ -416,5 +469,11 @@ NNVM_REGISTER_OP(BatchNorm) } }); +NNVM_REGISTER_OP(_backward_BatchNorm) +.set_num_outputs(5) +.set_attr("TIsBackward", true) +.set_attr_parser(ParamParser) +.set_attr("FCompute", BatchNormGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/batch_norm.cu b/src/operator/nn/batch_norm.cu similarity index 89% rename from src/operator/batch_norm.cu rename to src/operator/nn/batch_norm.cu index 9a8b576a16ee..65170bb7900c 100644 --- a/src/operator/batch_norm.cu +++ b/src/operator/nn/batch_norm.cu @@ -35,10 +35,10 @@ #define USE_GLOBAL_STATS_FLAG 32 #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 -#include "./cudnn_batch_norm-inl.h" +#include "./cudnn/cudnn_batch_norm-inl.h" #endif -#include "../common/cuda_utils.h" +#include "../../common/cuda_utils.h" using namespace mxnet; @@ -636,30 +636,86 @@ void BatchNormOp::DoBackward(mshadow::Stream *stream, MSHADOW_CUDA_POST_KERNEL_CHECK(BatchNormOp_DoBackward_gpu); } -/*! 
\brief Create GPU operator for batch normalization */ +template +static CuDNNBatchNormOp &GetCuDNNOp(const BatchNormParam& param) { + static thread_local CuDNNBatchNormOp op; + op.Init(param); + return op; +} + template<> -Operator *CreateOp(BatchNormParam param, const int dtype, const TShape& shape) { +void BatchNormCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + BatchNormParam param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 5U); + std::vector in_data(inputs.begin(), inputs.begin() + 3); + std::vector aux_states(inputs.begin() + 3, inputs.end()); + int dtype = inputs[0].type_flag_; + TShape shape = inputs[0].shape_; + param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis); - Operator *op = NULL; #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 if (!param.use_global_stats && !param.cudnn_off && shape.ndim() <= 4 && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS) { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new CuDNNBatchNormOp(param); + GetCuDNNOp(param).Forward(ctx, in_data, req, outputs, aux_states); }) } else { MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, { - op = new BatchNormOp(param); + GetBatchNormOp(param).Forward(ctx, in_data, req, outputs, aux_states); }) } #else - MSHADOW_REAL_TYPE_SWITCH_EX(dtype, - DType, - AccReal, - { op = new BatchNormOp(param); }); + MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, { + GetBatchNormOp(param).Forward(ctx, in_data, req, outputs, aux_states); + }); #endif - return op; } +template<> +void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 11U); + BatchNormParam param = nnvm::get(attrs.parsed); + std::vector out_grad(1, inputs[0]); + std::vector in_data(inputs.begin() + 3, inputs.begin() + 6); + std::vector aux_states(inputs.begin() + 6, inputs.begin() + 8); + std::vector out_data(inputs.begin() + 8, inputs.end()); + std::vector in_grad(outputs.begin(), outputs.begin() + 3); + int dtype = inputs[0].type_flag_; + TShape shape = inputs[0].shape_; + + param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis); +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 + if (!param.use_global_stats && !param.cudnn_off && shape.ndim() <= 4 + && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS) { + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + GetCuDNNOp(param).Backward(ctx, out_grad, in_data, out_data, + req, in_grad, aux_states); + }) + } else { + MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, { + GetBatchNormOp(param).Backward(ctx, out_grad, + in_data, out_data, req, in_grad, aux_states); + }) + } +#else + MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, { + GetBatchNormOp(param).Backward(ctx, out_grad, + in_data, out_data, req, in_grad, aux_states); + }); +#endif +} + +NNVM_REGISTER_OP(BatchNorm) +.set_attr("FCompute", BatchNormCompute); + +NNVM_REGISTER_OP(_backward_BatchNorm) +.set_attr("FCompute", BatchNormGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/convolution-inl.h b/src/operator/nn/convolution-inl.h similarity index 53% rename from src/operator/convolution-inl.h rename to src/operator/nn/convolution-inl.h index 5843293a362b..fd4030104c87 100644 --- a/src/operator/convolution-inl.h +++ b/src/operator/nn/convolution-inl.h @@ -21,10 +21,10 @@ * \file convolution-inl.h * \brief * \ref: 
https://github.com/Yangqing/caffe/wiki/Convolution-in-Caffe:-a-memo - * \author Bing Xu, Jun Wu + * \author Bing Xu, Jun Wu, Da Zheng */ -#ifndef MXNET_OPERATOR_CONVOLUTION_INL_H_ -#define MXNET_OPERATOR_CONVOLUTION_INL_H_ +#ifndef MXNET_OPERATOR_NN_CONVOLUTION_INL_H_ +#define MXNET_OPERATOR_NN_CONVOLUTION_INL_H_ #include #include @@ -38,9 +38,9 @@ #include #include #include -#include "./operator_common.h" -#include "./nn/im2col.h" -#include "./linalg.h" +#include "../operator_common.h" +#include "./im2col.h" +#include "../linalg.h" namespace mxnet { @@ -147,9 +147,9 @@ namespace mxnet { namespace op { template -class ConvolutionOp : public Operator { +class ConvolutionOp { public: - explicit ConvolutionOp(ConvolutionParam p) { + void Init(ConvolutionParam p) { this->param_ = p; // convert MBytes first to Bytes and then to elements. param_.workspace = (param_.workspace << 20) / sizeof(DType); @@ -159,11 +159,10 @@ class ConvolutionOp : public Operator { << "Only support NCW, NCHW and NCDHW layout"; } - virtual void Forward(const OpContext &ctx, + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(req[conv::kOut], kWriteTo); @@ -232,18 +231,19 @@ class ConvolutionOp : public Operator { } } - virtual void Backward(const OpContext &ctx, + void Backward(const OpContext &ctx, const std::vector& out_grad, const std::vector& in_data, - const std::vector& out_data, const std::vector& req, - const std::vector& in_grad, - const std::vector& aux_args) { + const std::vector& in_grad) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(out_grad.size(), 1U); + // We expect 2 inputs: in data and weight. We don't need bias for + // computing gradient. size_t expected = param_.no_bias == 0 ? 3 : 2; - CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(in_grad.size(), expected); CHECK_EQ(req.size(), expected); CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true); LayerSetUp(in_grad[conv::kData].shape_, out_grad[conv::kOut].shape_); @@ -385,299 +385,35 @@ class ConvolutionOp : public Operator { }; // class ConvolutionOp template -Operator* CreateOp(ConvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx); - -#if DMLC_USE_CXX11 -class ConvolutionProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (!param_.no_bias) { - return {"data", "weight", "bias"}; - } else { - return {"data", "weight"}; - } - } - - void Init(const std::vector >& kwargs) override { - using namespace mshadow; - param_.Init(kwargs); - if (param_.kernel.ndim() == 1) { - param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW; - if (param_.stride.ndim() == 0) param_.stride = Shape1(1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1); - if (param_.pad.ndim() == 0) param_.pad = Shape1(0); - } else if (param_.kernel.ndim() == 2) { - param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW; - if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); - } else { - CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D convolution not supported"; - param_.layout = param_.layout ? 
param_.layout.value(): mshadow::kNCDHW; - if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); - } - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - if (!param_.no_bias) { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; - } else { - CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; - } - // CHECK_EQ(out_shape->size(), 1) << "Output: [output]"; - out_shape->resize(1, TShape()); - const TShape &dshp = (*in_shape)[conv::kData]; - if (dshp.ndim() == 0) return false; - - if (param_.kernel.ndim() == 1) { - // 1d conv - CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; - Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW); - Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, - param_.kernel[0]); - wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); - wshape[0] *= param_.num_group; - SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); - } - - const index_t dilated_ksize_x = param_.DilatedKernelSize(0); - CHECK_EQ(dshape[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - Shape<3> oshape; - oshape[0] = dshape[0]; - oshape[1] = param_.num_filter; - oshape[2] = dshape[2] ? - (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0; - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); - // Perform incomplete shape inference. Fill in the missing values in data shape. - // 1) We can always fill in the batch_size. - // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
- oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW); - dshape[0] = oshape[0]; - if (oshape[2] && param_.stride[0] == 1) { - dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0]; - } - SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, - ConvertLayout(dshape, kNCW, param_.layout.value())); - // Check whether the kernel sizes are valid - if (dshape[2] != 0) { - CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; - } - return true; - } else if (param_.kernel.ndim() == 2) { - // 2d conv - CHECK_EQ(dshp.ndim(), 4U) \ - << "Input data should be 4D in batch-num_filter-y-x"; - Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW); - Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, - dshape[1] / param_.num_group, - param_.kernel[0], param_.kernel[1]); - wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); - wshape[0] *= param_.num_group; - SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); - } - - const index_t dilated_ksize_y = param_.DilatedKernelSize(0); - const index_t dilated_ksize_x = param_.DilatedKernelSize(1); - CHECK_EQ(dshape[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - Shape<4> oshape; - oshape[0] = dshape[0]; - oshape[1] = param_.num_filter; - oshape[2] = dshape[2] ? - (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : 0; - oshape[3] = dshape[3] ? - (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : 0; - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); - // Perform incomplete shape inference. Fill in the missing values in data shape. - // 1) We can always fill in the batch_size. - // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
- oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW); - dshape[0] = oshape[0]; - if (oshape[2] && param_.stride[0] == 1) { - dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param_.pad[0]; - } - if (oshape[3] && param_.stride[1] == 1) { - dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param_.pad[1]; - } - SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, - ConvertLayout(dshape, kNCHW, param_.layout.value())); - // Check whether the kernel sizes are valid - if (dshape[2] != 0) { - CHECK_LE(dilated_ksize_y, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; - } - if (dshape[3] != 0) { - CHECK_LE(dilated_ksize_x, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; - } - return true; - } else if (param_.kernel.ndim() == 3) { - // 3d conv - CHECK_EQ(dshp.ndim(), 5U) \ - << "Input data should be 5D in batch-num_filter-depth-y-x"; - Shape<5> dshape = ConvertLayout(dshp.get<5>(), param_.layout.value(), kNCDHW); - Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, - param_.kernel[0], param_.kernel[1], param_.kernel[2]); - wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value()); - wshape[0] *= param_.num_group; - SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); - } - - // Note: 3D dilation currently not supported. - // Calculations below done to preserve symmetry with 1D/2D code. - const index_t dilated_ksize_d = param_.DilatedKernelSize(0); - const index_t dilated_ksize_y = param_.DilatedKernelSize(1); - const index_t dilated_ksize_x = param_.DilatedKernelSize(2); - CHECK_EQ(dshape[1] % param_.num_group, 0U) - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - CHECK_EQ(param_.dilate.Size(), 1U) - << "Dilate is not supported in 3d convolution"; - Shape<5> oshape; - oshape[0] = dshape[0]; - oshape[1] = param_.num_filter; - oshape[2] = dshape[2] ? - (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : 0; - oshape[3] = dshape[3] ? - (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : 0; - oshape[4] = dshape[4] ? - (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : 0; - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); - // Perform incomplete shape inference. Fill in the missing values in data shape. - // 1) We can always fill in the batch_size. - // 2) We can back-calculate the input depth/height/width if the corresponding stride is 1. 
- oshape = ConvertLayout((*out_shape)[0].get<5>(), param_.layout.value(), kNCDHW); - dshape[0] = oshape[0]; - if (oshape[2] && param_.stride[0] == 1) { - dshape[2] = oshape[2] + dilated_ksize_d - 1 - 2 * param_.pad[0]; - } - if (oshape[3] && param_.stride[1] == 1) { - dshape[3] = oshape[3] + dilated_ksize_y - 1 - 2 * param_.pad[1]; - } - if (oshape[4] && param_.stride[2] == 1) { - dshape[4] = oshape[4] + dilated_ksize_x - 1 - 2 * param_.pad[2]; - } - SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, - ConvertLayout(dshape, kNCDHW, param_.layout.value())); - // Check whether the kernel sizes are valid - if (dshape[2] != 0) { - CHECK_LE(dilated_ksize_d, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; - } - if (dshape[3] != 0) { - CHECK_LE(dilated_ksize_y, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; - } - if (dshape[4] != 0) { - CHECK_LE(dilated_ksize_x, AddPad(dshape[4], param_.pad[2])) << "kernel size exceed input"; - } - return true; - } else { - LOG(FATAL) << "Unknown convolution type"; - return false; - } - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } - } - out_type->clear(); - out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new ConvolutionProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "Convolution"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[conv::kOut], in_data[conv::kData], in_data[conv::kWeight]}; - } - - std::vector ForwardResource( - const std::vector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } - - std::vector BackwardResource( - const std::vector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +void ConvolutionCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + MSHADOW_REAL_TYPE_SWITCH(inputs[conv::kData].type_flag_, DType, { + static thread_local ConvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); + }); +} - private: - // Adds symmetric padding to a data input (in one dimension) - index_t AddPad(index_t dsize, index_t pad) const { - return dsize + 2 * pad; - } +template +void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + std::vector in_data(inputs.begin() + 1, inputs.end()); + const TBlob &out_grad = inputs[0]; + const std::vector &in_grad = outputs; + + MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, { + static thread_local ConvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, 
in_grad);
+  });
+}
-  ConvolutionParam param_;
-};  // class ConvolutionProp
-#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
-#endif  // MXNET_OPERATOR_CONVOLUTION_INL_H_
+#endif  // MXNET_OPERATOR_NN_CONVOLUTION_INL_H_
diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc
new file mode 100644
index 000000000000..160cb8eef6bf
--- /dev/null
+++ b/src/operator/nn/convolution.cc
@@ -0,0 +1,512 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file convolution.cc
+ * \brief Convolution operator
+ * \author Bing Xu, Jun Wu, Da Zheng
+*/
+
+#include "./convolution-inl.h"
+#include "../elemwise_op_common.h"
+#include "./mkldnn/mkldnn_ops-inl.h"
+#if MXNET_USE_NNPACK == 1
+#include "./nnpack/nnpack_convolution-inl.h"
+#endif  // MXNET_USE_NNPACK
+
+namespace mxnet {
+namespace op {
+DMLC_REGISTER_PARAMETER(ConvolutionParam);
+
+static inline index_t AddPad(index_t dsize, index_t pad) {
+  return dsize + 2 * pad;
+}
+
+static inline std::vector<std::string> ListArguments(const ConvolutionParam& param_) {
+  if (!param_.no_bias) {
+    return {"data", "weight", "bias"};
+  } else {
+    return {"data", "weight"};
+  }
+}
+
+static void ConvolutionCompute_CPU(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<NDArray>& inputs,
+    const std::vector<OpReqType>& req, const std::vector<NDArray>& outputs) {
+#if MXNET_USE_MKLDNN == 1
+  switch (inputs[0].dtype()) {
+    case mshadow::kFloat32:
+      MKLDNNConvolution_Forward(attrs, ctx, inputs, req, outputs);
+      return;
+  }
+#endif
+  // TODO: convert the layout here if the inputs are in an MKLDNN format.
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++)
+    in_blobs[i] = inputs[i].data();
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++)
+    out_blobs[i] = outputs[i].data();
+  ConvolutionCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
+}
+
+static void ConvolutionGradCompute_CPU(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<NDArray>& inputs,
+    const std::vector<OpReqType>& req, const std::vector<NDArray>& outputs) {
+#if MXNET_USE_MKLDNN == 1
+  switch (inputs[0].dtype()) {
+    case mshadow::kFloat32:
+      MKLDNNConvolution_Backward(attrs, ctx, inputs, req, outputs);
+      return;
+  }
+#endif
+  // TODO: convert the layout here if the inputs are in an MKLDNN format.
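+  // Note (illustrative, with a hypothetical helper name): inputs[i].data()
+  // below assumes each NDArray is already in the default dense layout. If an
+  // input were still in an MKLDNN-specific layout, a reorder along these
+  // lines would be needed before the TBlob fallback is valid:
+  //   NDArray tmp = inputs[i].Reorder2Default();  // assumed helper
+  //   in_blobs[i] = tmp.data();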
+ std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + ConvolutionGradCompute(attrs, ctx, in_blobs, req, out_blobs); +} + +static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, std::vector *out_shape) { + using namespace mshadow; + const ConvolutionParam& param_ = nnvm::get(attrs.parsed); + if (!param_.no_bias) { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + } + // CHECK_EQ(out_shape->size(), 1) << "Output: [output]"; + out_shape->resize(1, TShape()); + const TShape &dshp = (*in_shape)[conv::kData]; + if (dshp.ndim() == 0) return false; + + if (param_.kernel.ndim() == 1) { + // 1d conv + CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; + Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW); + Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, + param_.kernel[0]); + wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } + + const index_t dilated_ksize_x = param_.DilatedKernelSize(0); + CHECK_EQ(dshape[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + Shape<3> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = dshape[2] ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
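+    // Illustrative worked example: width = 10, kernel = 3, pad = 1,
+    // stride = 1, dilate = 1 gives dilated_ksize_x = 1*(3-1)+1 = 3 and
+    // out_width = (10 + 2*1 - 3)/1 + 1 = 10; the back-calculation below then
+    // recovers width = 10 + 3 - 1 - 2*1 = 10.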
+ oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW); + dshape[0] = oshape[0]; + if (oshape[2] && param_.stride[0] == 1) { + dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; + } + return true; + } else if (param_.kernel.ndim() == 2) { + // 2d conv + CHECK_EQ(dshp.ndim(), 4U) \ + << "Input data should be 4D in batch-num_filter-y-x"; + Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW); + Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, + dshape[1] / param_.num_group, + param_.kernel[0], param_.kernel[1]); + wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } + + const index_t dilated_ksize_y = param_.DilatedKernelSize(0); + const index_t dilated_ksize_x = param_.DilatedKernelSize(1); + CHECK_EQ(dshape[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + Shape<4> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = dshape[2] ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : 0; + oshape[3] = dshape[3] ? + (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : 0; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
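+    // Illustrative worked example for the grouped weight shape above:
+    // num_filter = 64, num_group = 2 and 32 input channels give a per-group
+    // weight of (32, 16, kernel[0], kernel[1]); after wshape[0] *= num_group
+    // the full weight is (64, 16, kernel[0], kernel[1]).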
+ oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW); + dshape[0] = oshape[0]; + if (oshape[2] && param_.stride[0] == 1) { + dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param_.pad[0]; + } + if (oshape[3] && param_.stride[1] == 1) { + dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param_.pad[1]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCHW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(dilated_ksize_y, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; + } + if (dshape[3] != 0) { + CHECK_LE(dilated_ksize_x, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; + } + return true; + } else if (param_.kernel.ndim() == 3) { + // 3d conv + CHECK_EQ(dshp.ndim(), 5U) \ + << "Input data should be 5D in batch-num_filter-depth-y-x"; + Shape<5> dshape = ConvertLayout(dshp.get<5>(), param_.layout.value(), kNCDHW); + Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, + param_.kernel[0], param_.kernel[1], param_.kernel[2]); + wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } + + // Note: 3D dilation currently not supported. + // Calculations below done to preserve symmetry with 1D/2D code. + const index_t dilated_ksize_d = param_.DilatedKernelSize(0); + const index_t dilated_ksize_y = param_.DilatedKernelSize(1); + const index_t dilated_ksize_x = param_.DilatedKernelSize(2); + CHECK_EQ(dshape[1] % param_.num_group, 0U) + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + CHECK_EQ(param_.dilate.Size(), 1U) + << "Dilate is not supported in 3d convolution"; + Shape<5> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = dshape[2] ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : 0; + oshape[3] = dshape[3] ? + (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : 0; + oshape[4] = dshape[4] ? + (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : 0; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input depth/height/width if the corresponding stride is 1. 
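+    // Illustrative note: since dilate must be 1 in 3-D (checked above),
+    // dilated_ksize_d = 1*(kernel[0]-1)+1 simply equals kernel[0]. In 2-D,
+    // by contrast, a kernel of 3 with dilate = 2 gives 2*(3-1)+1 = 5. The
+    // dilated sizes are kept so the 3-D code mirrors the 1-D/2-D code.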
+ oshape = ConvertLayout((*out_shape)[0].get<5>(), param_.layout.value(), kNCDHW); + dshape[0] = oshape[0]; + if (oshape[2] && param_.stride[0] == 1) { + dshape[2] = oshape[2] + dilated_ksize_d - 1 - 2 * param_.pad[0]; + } + if (oshape[3] && param_.stride[1] == 1) { + dshape[3] = oshape[3] + dilated_ksize_y - 1 - 2 * param_.pad[1]; + } + if (oshape[4] && param_.stride[2] == 1) { + dshape[4] = oshape[4] + dilated_ksize_x - 1 - 2 * param_.pad[2]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCDHW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(dilated_ksize_d, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; + } + if (dshape[3] != 0) { + CHECK_LE(dilated_ksize_y, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; + } + if (dshape[4] != 0) { + CHECK_LE(dilated_ksize_x, AddPad(dshape[4], param_.pad[2])) << "kernel size exceed input"; + } + return true; + } else { + LOG(FATAL) << "Unknown convolution type"; + return false; + } +} + +static bool ConvolutionType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + const ConvolutionParam& param_ = nnvm::get(attrs.parsed); + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]); + } + } + out_type->clear(); + out_type->push_back(dtype); + return true; +} + +inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 2 : 3; + CHECK_EQ(in_attrs->size(), in_expected); + CHECK_EQ(out_attrs->size(), 1); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + (*out_attrs)[0] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + (*out_attrs)[0] = kDefaultStorage; + return true; +} + +inline static bool backward_ConvStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 3 : 4; + uint32_t out_expected = param.no_bias ? 
2 : 3; + CHECK_EQ(in_attrs->size(), in_expected); + CHECK_EQ(out_attrs->size(), out_expected); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kDefaultStorage; + return true; +} + +static void ConvolutionParamParser(nnvm::NodeAttrs* attrs) { + using namespace mshadow; + ConvolutionParam param_; + try { + param_.Init(attrs->dict); + } catch (const dmlc::ParamError& e) { + std::ostringstream os; + os << e.what(); + os << ", in operator " << attrs->op->name << "(" + << "name=\"" << attrs->name << "\""; + for (const auto& k : attrs->dict) { + os << ", " << k.first << "=\"" << k.second << "\""; + } + os << ")"; + throw dmlc::ParamError(os.str()); + } + + if (param_.kernel.ndim() == 1) { + param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW; + if (param_.stride.ndim() == 0) param_.stride = Shape1(1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1); + if (param_.pad.ndim() == 0) param_.pad = Shape1(0); + } else if (param_.kernel.ndim() == 2) { + param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW; + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + } else { + CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D convolution not supported"; + param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW; + if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); + } + attrs->parsed = std::move(param_); +} + +struct ConvolutionGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + const ConvolutionParam& param = nnvm::get(n->attrs.parsed); + std::vector heads(ograds.begin(), ograds.end()); + heads.push_back(n->inputs[conv::kData]); + heads.push_back(n->inputs[conv::kWeight]); + if (!param.no_bias) + heads.push_back(n->inputs[conv::kBias]); + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + +NNVM_REGISTER_OP(Convolution) +.describe(R"code(Compute *N*-D convolution on *(N+2)*-D input. + +In the 2-D convolution, given input data with shape *(batch_size, +channel, height, width)*, the output is computed by + +.. math:: + + out[n,i,:,:] = bias[i] + \sum_{j=0}^{channel} data[n,j,:,:] \star + weight[i,j,:,:] + +where :math:`\star` is the 2-D cross-correlation operator. + +For general 2-D convolution, the shapes are + +- **data**: *(batch_size, channel, height, width)* +- **weight**: *(num_filter, channel, kernel[0], kernel[1])* +- **bias**: *(num_filter,)* +- **out**: *(batch_size, num_filter, out_height, out_width)*. + +Define:: + + f(x,k,p,s,d) = floor((x+2*p-d*(k-1)-1)/s)+1 + +then we have:: + + out_height=f(height, kernel[0], pad[0], stride[0], dilate[0]) + out_width=f(width, kernel[1], pad[1], stride[1], dilate[1]) + +If ``no_bias`` is set to be true, then the ``bias`` term is ignored. + +The default data ``layout`` is *NCHW*, namely *(batch_size, channel, height, +width)*. We can choose other layouts such as *NHWC*. 
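+
+As an illustrative example of the shape formula, ``height = 224``,
+``kernel[0] = 3``, ``pad[0] = 1``, ``stride[0] = 1`` and ``dilate[0] = 1``
+give::
+
+  out_height = f(224, 3, 1, 1, 1) = floor((224 + 2*1 - 1*(3-1) - 1)/1) + 1 = 224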
+ +If ``num_group`` is larger than 1, denoted by *g*, then split the input ``data`` +evenly into *g* parts along the channel axis, and also evenly split ``weight`` +along the first dimension. Next compute the convolution on the *i*-th part of +the data with the *i*-th weight part. The output is obtained by concatenating all +the *g* results. + +1-D convolution does not have a *height* dimension but only *width* in space. + +- **data**: *(batch_size, channel, width)* +- **weight**: *(num_filter, channel, kernel[0])* +- **bias**: *(num_filter,)* +- **out**: *(batch_size, num_filter, out_width)*. + +3-D convolution adds an additional *depth* dimension besides *height* and +*width*. The shapes are + +- **data**: *(batch_size, channel, depth, height, width)* +- **weight**: *(num_filter, channel, kernel[0], kernel[1], kernel[2])* +- **bias**: *(num_filter,)* +- **out**: *(batch_size, num_filter, out_depth, out_height, out_width)*. + +Both ``weight`` and ``bias`` are learnable parameters. + +There are other options to tune performance. + +- **cudnn_tune**: enabling this option leads to higher startup time but may give + faster speed. Options are + + - **off**: no tuning + - **limited_workspace**: run test and pick the fastest algorithm that doesn't + exceed the workspace limit. + - **fastest**: pick the fastest algorithm and ignore the workspace limit. + - **None** (default): the behavior is determined by environment variable + ``MXNET_CUDNN_AUTOTUNE_DEFAULT``. 0 for off, 1 for limited workspace + (default), 2 for fastest. + +- **workspace**: A large number leads to more (GPU) memory usage but may improve + performance. + +)code" ADD_FILELINE) +.set_num_inputs([](const NodeAttrs& attrs) { + const ConvolutionParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 2 : 3; +}) +.set_num_outputs(1) +.set_attr_parser(ConvolutionParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + const ConvolutionParam& params = nnvm::get(attrs.parsed); + if (params.no_bias) + return std::vector{"data", "weight"}; + else + return std::vector{"data", "weight", "bias"}; +}) +.set_attr("FInferShape", ConvolutionShape) +.set_attr("FInferType", ConvolutionType) +.set_attr("FInferStorageType", ConvStorageType) +.set_attr("FCompute", ConvolutionCompute) +.set_attr("FComputeEx", ConvolutionCompute_CPU) +.set_attr("FGradient", ConvolutionGrad{"_backward_Convolution"}) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +.add_argument("data", "NDArray-or-Symbol", "Input data to the ConvolutionOp.") +.add_argument("weight", "NDArray-or-Symbol", "Weight matrix.") +.add_argument("bias", "NDArray-or-Symbol", "Bias parameter.") +.add_arguments(ConvolutionParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_Convolution) +.set_num_inputs(3) +.set_num_outputs([](const NodeAttrs& attrs) { + const ConvolutionParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 
2 : 3; +}) +.set_attr("TIsBackward", true) +.set_attr("FInferStorageType", backward_ConvStorageType) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr_parser(ConvolutionParamParser) +.set_attr("FCompute", ConvolutionGradCompute) +.set_attr("FComputeEx", ConvolutionGradCompute_CPU); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu new file mode 100644 index 000000000000..c0b3ca586f5f --- /dev/null +++ b/src/operator/nn/convolution.cu @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file convolution.cu + * \brief + * \author Bing Xu, Jun Wu, Da Zheng +*/ + +#include "./convolution-inl.h" +#include +#include "./depthwise_convolution-inl.h" +#if MXNET_USE_CUDNN == 1 +#include "./cudnn/cudnn_convolution-inl.h" +#endif // MXNET_USE_CUDNN + +namespace mxnet { +namespace op { + +// This is to maintain one copy for each type. +template +static ConvolutionOp &get_op(const ConvolutionParam& param) { + static thread_local ConvolutionOp op; + op.Init(param); + return op; +} + +template +static CuDNNConvolutionOp &get_cudnn_op(const ConvolutionParam& param, + int forward_compute_type, int backward_compute_type, + const std::vector& in_shape, const std::vector& out_shape, + const Context& ctx) { + static thread_local CuDNNConvolutionOp op; + op.Init(param, forward_compute_type, backward_compute_type, + in_shape, out_shape, ctx); + return op; +} + +template<> +void ConvolutionCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + int dtype = inputs[conv::kData].type_flag_; + + // If 1D convolution, use MXNet implementation + if (param.kernel.ndim() == 1) { + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + ConvolutionOp &op = get_op(param); + op.Forward(ctx, inputs, req, outputs); + }) + return; + } else if (param.num_filter == param.num_group && + param.layout.value() == mshadow::kNCHW && + param.num_filter == inputs[conv::kData].shape_[1] && + param.kernel.ndim() == 2 && + param.dilate == mshadow::Shape2(1, 1) && + dtype == mshadow::kFloat32) { + static thread_local DepthwiseConvolutionOp op; + std::vector in_shape(inputs.size()); + std::vector out_shape(1, outputs[0].shape_); + for (size_t i = 0; i < in_shape.size(); i++) + in_shape[i] = inputs[i].shape_; + op.Init(param, in_shape, out_shape); + op.Forward(ctx, inputs, req, outputs); + return; + } + +#if MXNET_USE_CUDNN == 1 + // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16). + int compute_type = (dtype == mshadow::kFloat16) ? 
mshadow::kFloat32 : dtype; + + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + if (param.cudnn_off) { + ConvolutionOp &op = get_op(param); + op.Forward(ctx, inputs, req, outputs); + } else if (!CuDNNConvolutionOp::Supports(param, + compute_type, compute_type, ctx.run_ctx.ctx)) { + LOG(WARNING) << "This convolution is not supported by cuDNN; falling back to the MXNet implementation."; + ConvolutionOp &op = get_op(param); + op.Forward(ctx, inputs, req, outputs); + } else { + std::vector in_shape(inputs.size()); + std::vector out_shape(1, outputs[0].shape_); + for (size_t i = 0; i < in_shape.size(); i++) + in_shape[i] = inputs[i].shape_; + CuDNNConvolutionOp &op = get_cudnn_op(param, + compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx); + op.Forward(ctx, inputs, req, outputs); + } + }) +#else + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + ConvolutionOp &op = get_op(param); + op.Forward(ctx, inputs, req, outputs); + }) +#endif // MXNET_USE_CUDNN +} + +template<> +void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + std::vector in_data(inputs.begin() + 1, inputs.end()); + const TBlob &out_grad = inputs[0]; + const std::vector &in_grad = outputs; + int dtype = out_grad.type_flag_; + + // If 1D convolution, use MXNet implementation + if (param.kernel.ndim() == 1) { + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + ConvolutionOp &op = get_op(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + }) + return; + } else if (param.num_filter == param.num_group && + param.layout.value() == mshadow::kNCHW && + param.num_filter == in_data[conv::kData].shape_[1] && + param.kernel.ndim() == 2 && + param.dilate == mshadow::Shape2(1, 1) && + dtype == mshadow::kFloat32) { + static thread_local DepthwiseConvolutionOp op; + // The first element stores out grad. + std::vector in_shape(in_data.size()); + std::vector out_shape(1, out_grad.shape_); + for (size_t i = 0; i < in_shape.size(); i++) + in_shape[i] = in_data[i].shape_; + op.Init(param, in_shape, out_shape); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + return; + } + +#if MXNET_USE_CUDNN == 1 + // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16). + int compute_type = (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype; + + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + if (param.cudnn_off) { + ConvolutionOp &op = get_op(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + } else if (!CuDNNConvolutionOp::Supports(param, + compute_type, compute_type, ctx.run_ctx.ctx)) { + LOG(WARNING) << "This convolution is not supported by cuDNN; falling back to the MXNet implementation."; + ConvolutionOp &op = get_op(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + } else { + // The first element stores out grad. 
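/* Summary sketch (not part of the original patch): both ConvolutionCompute<gpu>
   and ConvolutionGradCompute<gpu> above follow the same dispatch cascade, shown
   here with the template arguments that the surrounding text elides:

     if (param.kernel.ndim() == 1)                       // 1-D: MXNet's own kernel
       ConvolutionOp<gpu, DType>
     else if (num_filter == num_group && ...)            // depthwise fp32 NCHW case
       DepthwiseConvolutionOp<float>
     else if (param.cudnn_off ||
              !CuDNNConvolutionOp<DType>::Supports(...)) // cuDNN off or unsupported
       ConvolutionOp<gpu, DType>
     else
       CuDNNConvolutionOp<DType>                         // cached thread_local copy
*/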
+ std::vector in_shape(in_data.size()); + std::vector out_shape(1, out_grad.shape_); + for (size_t i = 0; i < in_shape.size(); i++) + in_shape[i] = in_data[i].shape_; + CuDNNConvolutionOp &op = get_cudnn_op(param, + compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + } + }) +#else + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + ConvolutionOp &op = get_op(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + }) +#endif // MXNET_USE_CUDNN +} + +NNVM_REGISTER_OP(Convolution) +.set_attr("FCompute", ConvolutionCompute); + +NNVM_REGISTER_OP(_backward_Convolution) +.set_attr("FCompute", ConvolutionGradCompute); + +} // namespace op +} // namespace mxnet + diff --git a/src/operator/cudnn_activation-inl.h b/src/operator/nn/cudnn/cudnn_activation-inl.h similarity index 58% rename from src/operator/cudnn_activation-inl.h rename to src/operator/nn/cudnn/cudnn_activation-inl.h index 317ef47c126a..e513f57c8642 100644 --- a/src/operator/cudnn_activation-inl.h +++ b/src/operator/nn/cudnn/cudnn_activation-inl.h @@ -23,21 +23,27 @@ * \author Bing Xu */ -#ifndef MXNET_OPERATOR_CUDNN_ACTIVATION_INL_H_ -#define MXNET_OPERATOR_CUDNN_ACTIVATION_INL_H_ +#ifndef MXNET_OPERATOR_NN_CUDNN_CUDNN_ACTIVATION_INL_H_ +#define MXNET_OPERATOR_NN_CUDNN_CUDNN_ACTIVATION_INL_H_ #include #include -#include "./activation-inl.h" +#include "../activation-inl.h" namespace mxnet { namespace op { template -class CuDNNActivationOp : public Operator { +class CuDNNActivationOp { public: - explicit CuDNNActivationOp(ActivationParam param) { - param_ = param; - init_cudnn_ = false; + CuDNNActivationOp() { dtype_ = mshadow::DataType::kCudnnFlag; + #if CUDNN_MAJOR >= 5 + nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; + CUDNN_CALL(cudnnCreateActivationDescriptor(&desc_)); + #endif + } + + void Init(const ActivationParam ¶m) { + param_ = param; switch (param_.act_type) { case activation::kReLU: mode_ = CUDNN_ACTIVATION_RELU; @@ -53,67 +59,55 @@ class CuDNNActivationOp : public Operator { break; } #if CUDNN_MAJOR >= 5 - nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; - CUDNN_CALL(cudnnCreateActivationDescriptor(&desc_)); CUDNN_CALL(cudnnSetActivationDescriptor(desc_, mode_, nan_prop_, relu_ceil_)); #endif + CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); } ~CuDNNActivationOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_)); - #if CUDNN_MAJOR >= 5 - CUDNN_CALL(cudnnDestroyActivationDescriptor(desc_)); - #endif - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_)); + #if CUDNN_MAJOR >= 5 + CUDNN_CALL(cudnnDestroyActivationDescriptor(desc_)); + #endif } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, const TBlob &in_data, + const OpReqType &req, const TBlob &out_data) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); Tensor data; Tensor out; - if (in_data[activation::kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], - in_data[activation::kData].shape_[1], 1, 1); - data = in_data[activation::kData].get_with_shape(dshape, s); - out = out_data[activation::kOut].get_with_shape(dshape, s); + if (in_data.ndim() == 2) { + Shape<4> dshape = Shape4(in_data.shape_[0], + in_data.shape_[1], 1, 1); + data = in_data.get_with_shape(dshape, s); + out 
= out_data.get_with_shape(dshape, s); } else { Shape<4> dshape; - index_t size_left = in_data[activation::kData].Size(); + index_t size_left = in_data.Size(); for (int i = 0; i < 3; ++i) { - if (i < in_data[activation::kData].ndim()) { - dshape[i] = in_data[activation::kData].shape_[i]; + if (i < in_data.ndim()) { + dshape[i] = in_data.shape_[i]; } else { dshape[i] = 1; } size_left /= dshape[i]; } dshape[3] = size_left; - data = in_data[activation::kData].get_with_shape(dshape, s); - out = out_data[activation::kOut].get_with_shape(dshape, s); + data = in_data.get_with_shape(dshape, s); + out = out_data.get_with_shape(dshape, s); } typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); - if (!init_cudnn_) { - init_cudnn_ = true; - CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); - CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - data.shape_[0], - data.shape_[1], - data.shape_[2], - data.shape_[3])); - } + CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3])); #if CUDNN_MAJOR <= 4 CUDNN_CALL(cudnnActivationForward(s->dnn_handle_, mode_, @@ -135,20 +129,11 @@ class CuDNNActivationOp : public Operator { #endif } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + void Backward(const OpContext &ctx, const TBlob &out_grad, + const TBlob &in_data, const TBlob &out_data, + const OpReqType &req, const TBlob &in_grad) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - CHECK_EQ(req.size(), 1U); - CHECK_EQ(in_grad.size(), 1U); typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; Stream *s = ctx.get_stream(); @@ -156,31 +141,38 @@ class CuDNNActivationOp : public Operator { Tensor data; Tensor output_data; Tensor input_grad; - if (in_grad[activation::kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_grad[activation::kData].shape_[0], - in_grad[activation::kData].shape_[1], 1, 1); - data = in_data[activation::kData].get_with_shape(dshape, s); - grad = out_grad[activation::kOut].get_with_shape(dshape, s); - output_data = out_data[activation::kOut].get_with_shape(dshape, s); - input_grad = in_grad[activation::kData].get_with_shape(dshape, s); + if (in_grad.ndim() == 2) { + Shape<4> dshape = Shape4(in_grad.shape_[0], + in_grad.shape_[1], 1, 1); + data = in_data.get_with_shape(dshape, s); + grad = out_grad.get_with_shape(dshape, s); + output_data = out_data.get_with_shape(dshape, s); + input_grad = in_grad.get_with_shape(dshape, s); } else { Shape<4> dshape; - index_t size_left = in_grad[activation::kData].Size(); + index_t size_left = in_grad.Size(); for (int i = 0; i < 3; ++i) { - if (i < in_grad[activation::kData].ndim()) { - dshape[i] = in_grad[activation::kData].shape_[i]; + if (i < in_grad.ndim()) { + dshape[i] = in_grad.shape_[i]; } else { dshape[i] = 1; } size_left /= dshape[i]; } dshape[3] = size_left; - data = in_data[activation::kData].get_with_shape(dshape, s); - output_data = out_data[activation::kOut].get_with_shape(dshape, s); - grad = out_grad[activation::kOut].get_with_shape(dshape, s); - input_grad = 
in_grad[activation::kData].get_with_shape(dshape, s); + data = in_data.get_with_shape(dshape, s); + output_data = out_data.get_with_shape(dshape, s); + grad = out_grad.get_with_shape(dshape, s); + input_grad = in_grad.get_with_shape(dshape, s); } CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); + CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3])); #if CUDNN_MAJOR <= 4 CUDNN_CALL(cudnnActivationBackward(s->dnn_handle_, mode_, @@ -211,7 +203,6 @@ class CuDNNActivationOp : public Operator { } private: - bool init_cudnn_; cudnnDataType_t dtype_; cudnnActivationMode_t mode_; cudnnTensorDescriptor_t shape_desc_; @@ -224,4 +215,4 @@ class CuDNNActivationOp : public Operator { }; // class CuDNNActivationOp } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CUDNN_ACTIVATION_INL_H_ +#endif // MXNET_OPERATOR_NN_CUDNN_CUDNN_ACTIVATION_INL_H_ diff --git a/src/operator/cudnn_algoreg-inl.h b/src/operator/nn/cudnn/cudnn_algoreg-inl.h similarity index 95% rename from src/operator/cudnn_algoreg-inl.h rename to src/operator/nn/cudnn/cudnn_algoreg-inl.h index c10593fb0af4..ff94f618ce8b 100644 --- a/src/operator/cudnn_algoreg-inl.h +++ b/src/operator/nn/cudnn/cudnn_algoreg-inl.h @@ -22,16 +22,16 @@ * \brief * \author Bing Xu */ -#ifndef MXNET_OPERATOR_CUDNN_ALGOREG_INL_H_ -#define MXNET_OPERATOR_CUDNN_ALGOREG_INL_H_ +#ifndef MXNET_OPERATOR_NN_CUDNN_CUDNN_ALGOREG_INL_H_ +#define MXNET_OPERATOR_NN_CUDNN_CUDNN_ALGOREG_INL_H_ #include #include #include #include -#include "../common/cuda_utils.h" -#include "./convolution-inl.h" -#include "./deconvolution-inl.h" +#include "../../../common/cuda_utils.h" +#include "../convolution-inl.h" +#include "../deconvolution-inl.h" namespace mxnet { namespace op { #if MXNET_USE_CUDNN == 1 @@ -174,4 +174,4 @@ typedef CuDNNAlgoReg CuDNNDeconvAlgoReg; } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CUDNN_ALGOREG_INL_H_ +#endif // MXNET_OPERATOR_NN_CUDNN_CUDNN_ALGOREG_INL_H_ diff --git a/src/operator/cudnn_algoreg.cc b/src/operator/nn/cudnn/cudnn_algoreg.cc similarity index 100% rename from src/operator/cudnn_algoreg.cc rename to src/operator/nn/cudnn/cudnn_algoreg.cc diff --git a/src/operator/cudnn_batch_norm-inl.h b/src/operator/nn/cudnn/cudnn_batch_norm-inl.h similarity index 75% rename from src/operator/cudnn_batch_norm-inl.h rename to src/operator/nn/cudnn/cudnn_batch_norm-inl.h index bd3c2d6a1c3a..b0e35d932a89 100644 --- a/src/operator/cudnn_batch_norm-inl.h +++ b/src/operator/nn/cudnn/cudnn_batch_norm-inl.h @@ -23,13 +23,13 @@ * \author Junyuan Xie */ -#ifndef MXNET_OPERATOR_CUDNN_BATCH_NORM_INL_H_ -#define MXNET_OPERATOR_CUDNN_BATCH_NORM_INL_H_ +#ifndef MXNET_OPERATOR_NN_CUDNN_CUDNN_BATCH_NORM_INL_H_ +#define MXNET_OPERATOR_NN_CUDNN_CUDNN_BATCH_NORM_INL_H_ #include #include #include #include -#include "batch_norm-inl.h" +#include "../batch_norm-inl.h" namespace mxnet { namespace op { @@ -42,28 +42,30 @@ enum CuDNNBatchNormOpAuxiliary {kMovingMean, kMovingInvVar}; #if defined(__CUDACC__) template -class CuDNNBatchNormOp : public Operator { +class CuDNNBatchNormOp { public: - explicit CuDNNBatchNormOp(BatchNormParam param) { + CuDNNBatchNormOp() { using namespace mshadow; - CHECK_GE(param.eps, CUDNN_BN_MIN_EPSILON) - << "CuDNN requires eps to be no less than " << CUDNN_BN_MIN_EPSILON; - this->param_ = param; - init_cudnn_ = false; dtype_ = DataType::kCudnnFlag; // For float16 input type beta, gamma, mean, and average are stored 
in float32. // For other input types, these parameters have the same type as input dtype_param_ = (dtype_ == CUDNN_DATA_HALF) ? kFloat32 : DataType::kFlag; + CUDNN_CALL(cudnnCreateTensorDescriptor(&io_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&mean_desc_)); + } + + void Init(const BatchNormParam ¶m) { + CHECK_GE(param.eps, CUDNN_BN_MIN_EPSILON) + << "CuDNN requires eps to be no less than " << CUDNN_BN_MIN_EPSILON; + this->param_ = param; } ~CuDNNBatchNormOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(io_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(mean_desc_)); - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(io_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(mean_desc_)); } - virtual void Forward(const OpContext &ctx, + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data, @@ -83,29 +85,7 @@ class CuDNNBatchNormOp : public Operator { CHECK_GE(in_data[cudnnbatchnorm::kData].ndim(), 2); CHECK_LE(in_data[cudnnbatchnorm::kData].ndim(), 4); - if (!init_cudnn_) { - for (int i = 0; i < 4; ++i) { - if (i < in_data[cudnnbatchnorm::kData].ndim()) { - shape_[i] = in_data[cudnnbatchnorm::kData].shape_[i]; - } else { - shape_[i] = 1; - } - } - CUDNN_CALL(cudnnCreateTensorDescriptor(&io_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&mean_desc_)); - CUDNN_CALL(cudnnSetTensor4dDescriptor(io_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - shape_[0], - shape_[1], - shape_[2], - shape_[3])); - CUDNN_CALL(cudnnDeriveBNTensorDescriptor(mean_desc_, - io_desc_, - CUDNN_BATCHNORM_SPATIAL)); - init_cudnn_ = true; - } - + Init(in_data[cudnnbatchnorm::kData]); Stream *s = ctx.get_stream(); Tensor x = in_data[cudnnbatchnorm::kData].get_with_shape(shape_, s); @@ -176,7 +156,7 @@ class CuDNNBatchNormOp : public Operator { }) } - virtual void Backward(const OpContext &ctx, + void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data, @@ -192,6 +172,7 @@ class CuDNNBatchNormOp : public Operator { CHECK(ctx.is_train && !param_.use_global_stats) << "use global statistics is not yet supported in CuDNNBatchNorm"; + Init(in_data[cudnnbatchnorm::kData]); Stream *s = ctx.get_stream(); Tensor x = in_data[cudnnbatchnorm::kData].get_with_shape(shape_, s); @@ -289,7 +270,27 @@ class CuDNNBatchNormOp : public Operator { } private: - bool init_cudnn_; + void Init(const TBlob &in_data) { + for (int i = 0; i < 4; ++i) { + if (i < in_data.ndim()) { + shape_[i] = in_data.shape_[i]; + } else { + shape_[i] = 1; + } + } + + CUDNN_CALL(cudnnSetTensor4dDescriptor(io_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + shape_[0], + shape_[1], + shape_[2], + shape_[3])); + CUDNN_CALL(cudnnDeriveBNTensorDescriptor(mean_desc_, + io_desc_, + CUDNN_BATCHNORM_SPATIAL)); + } + cudnnDataType_t dtype_; int dtype_param_; cudnnTensorDescriptor_t io_desc_, mean_desc_; @@ -298,92 +299,7 @@ class CuDNNBatchNormOp : public Operator { }; #endif // defined(__CUDACC__) -template -Operator *CreateOp_CuDNNv4(BatchNormParam param); - - -#if DMLC_USE_CXX11 -class CuDNNBatchNormProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]"; - const TShape &dshape = in_shape->at(0); - if 
(dshape.ndim() == 0) return false; - in_shape->at(1) = TShape(Shape1(dshape[1])); - in_shape->at(2) = TShape(Shape1(dshape[1])); - - out_shape->clear(); - out_shape->push_back(dshape); - out_shape->push_back(Shape1(dshape[1])); - out_shape->push_back(Shape1(dshape[1])); - - aux_shape->clear(); - aux_shape->push_back(Shape1(dshape[1])); - aux_shape->push_back(Shape1(dshape[1])); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new CuDNNBatchNormProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "CuDNNBatchNorm"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[cudnnbatchnorm::kOut], - out_data[cudnnbatchnorm::kMean], - out_data[cudnnbatchnorm::kInvVar], - in_data[cudnnbatchnorm::kData], - in_data[cudnnbatchnorm::kGamma] - }; - } - - int NumVisibleOutputs() const override { - return 1; - } - - int NumOutputs() const override { - return 3; - } - - std::vector ListArguments() const override { - return {"data", "gamma", "beta"}; - } - - std::vector ListOutputs() const override { - return {"output", "mean", "inv_var"}; - } - - std::vector ListAuxiliaryStates() const override { - return {"moving_mean", "moving_inv_var"}; - } - - Operator* CreateOperator(Context ctx) const override; - - private: - BatchNormParam param_; -}; // class CuDNNBatchNormProp - -#endif // DMLC_USE_CXX11 #endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4 } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CUDNN_BATCH_NORM_INL_H_ +#endif // MXNET_OPERATOR_NN_CUDNN_CUDNN_BATCH_NORM_INL_H_ diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cc b/src/operator/nn/cudnn/cudnn_batch_norm.cc new file mode 100644 index 000000000000..4bf6b4a2422d --- /dev/null +++ b/src/operator/nn/cudnn/cudnn_batch_norm.cc @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file cudnn_batch_norm.cc + * \brief + * \author Junyuan Xie +*/ + +#include "./cudnn_batch_norm-inl.h" +#include +#include "../../elemwise_op_common.h" + +namespace mxnet { +namespace op { +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4 + +static bool BatchNormShape(const nnvm::NodeAttrs& attrs, std::vector *in_shape, + std::vector *out_shape) { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, moving_mean, moving_var]"; + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + in_shape->at(1) = TShape(Shape1(dshape[1])); + in_shape->at(2) = TShape(Shape1(dshape[1])); + in_shape->at(3) = TShape(Shape1(dshape[1])); + in_shape->at(4) = TShape(Shape1(dshape[1])); + + out_shape->clear(); + out_shape->push_back(dshape); + out_shape->push_back(Shape1(dshape[1])); + out_shape->push_back(Shape1(dshape[1])); + + return true; +} + +static void BatchNormCompute_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + LOG(FATAL) << "CuDNNBatchNormOp is only available for gpu."; +} + +static void BatchNormGradCompute_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + LOG(FATAL) << "CuDNNBatchNormOp is only available for gpu."; +} + +NNVM_REGISTER_OP(CuDNNBatchNorm) +.describe("Apply batch normalization to input.") +.set_num_inputs(5) +.set_num_outputs(3) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "gamma", "beta", "moving_mean", "moving_var"}; +}) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output", "mean", "var"}; +}) +.set_attr("FNumVisibleOutputs", + [](const NodeAttrs& attrs) { + return 1; +}) +.set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { + return std::vector{3, 4}; +}) +.set_attr("FInferShape", BatchNormShape) +.set_attr("FCompute", BatchNormCompute_CPU) +.set_attr("FGradient", ElemwiseGradUseInOut{"_backward_CuDNNBatchNorm"}) +.add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization") +.add_argument("gamma", "NDArray-or-Symbol", "gamma array") +.add_argument("beta", "NDArray-or-Symbol", "beta array") +.add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input") +.add_argument("moving_var", "NDArray-or-Symbol", "running variance of input") +.add_arguments(BatchNormParam::__FIELDS__()) +.set_attr( + "FSetInputVarAttrOnCompose", + [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { + if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return; + if (index == 3) { + var->attrs.dict["__init__"] = "[\"zero\", {}]"; + } else if (index == 4) { + var->attrs.dict["__init__"] = "[\"one\", {}]"; + } + }); + +NNVM_REGISTER_OP(_backward_CuDNNBatchNorm) +.set_num_outputs(5) +.set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { + return std::vector{6, 7}; +}) +.set_attr("TIsBackward", true) +.set_attr_parser(ParamParser) +.set_attr("FCompute", BatchNormGradCompute_CPU); + +#endif // CUDNN_MAJOR >= 4 + +} // namespace op +} // namespace mxnet diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cu b/src/operator/nn/cudnn/cudnn_batch_norm.cu new file mode 100644 index 000000000000..bdde6ecbc69c --- /dev/null +++ b/src/operator/nn/cudnn/cudnn_batch_norm.cu @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file cudnn_batch_norm.cu + * \brief + * \author Junyuan Xie +*/ + +#include "./cudnn_batch_norm-inl.h" +#include + +namespace mxnet { +namespace op { +#if CUDNN_MAJOR == 4 + +template +static CuDNNBatchNormOp &GetCuDNNOp(const BatchNormParam& param) { + static thread_local CuDNNBatchNormOp op; + op.Init(param); + return op; +} + +static void BatchNormCompute_CuDNNv4(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { +#if CUDNN_MAJOR >= 5 + LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5." + "Use the latter instead."; +#else + const BatchNormParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 5U); + std::vector in_data(inputs.begin(), inputs.begin() + 3); + std::vector aux_states(inputs.begin() + 3, inputs.end()); + GetCuDNNOp(param).Forward(ctx, in_data, req, outputs, aux_states); +#endif +} + +static void BatchNormGradCompute_CuDNNv4(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { +#if CUDNN_MAJOR >= 5 + LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5." 
+ "Use the latter instead."; +#else + CHECK_EQ(inputs.size(), 11U); + const BatchNormParam& param = nnvm::get(attrs.parsed); + std::vector out_grad(1, inputs[0]); + std::vector in_data(inputs.begin() + 3, inputs.begin() + 6); + std::vector aux_states(inputs.begin() + 6, inputs.begin() + 8); + std::vector out_data(inputs.begin() + 8, inputs.end()); + std::vector in_grad(outputs.begin(), outputs.begin() + 3); + GetCuDNNOp(param).Backward(ctx, out_grad, in_data, out_data, + req, in_grad, aux_states); +#endif +} + +NNVM_REGISTER_OP(CuDNNBatchNorm) +.set_attr("FCompute", BatchNormCompute_CuDNNv4); + +NNVM_REGISTER_OP(_backward_CuDNNBatchNorm) +.set_attr("FCompute", BatchNormGradCompute_CuDNNv4); + +#endif // CUDNN_MAJOR == 4 +} // namespace op +} // namespace mxnet + diff --git a/src/operator/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h similarity index 96% rename from src/operator/cudnn_convolution-inl.h rename to src/operator/nn/cudnn/cudnn_convolution-inl.h index b2b59944e895..8852c4cfccc9 100644 --- a/src/operator/cudnn_convolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h @@ -22,16 +22,16 @@ * \brief * \author Bing Xu */ -#ifndef MXNET_OPERATOR_CUDNN_CONVOLUTION_INL_H_ -#define MXNET_OPERATOR_CUDNN_CONVOLUTION_INL_H_ +#ifndef MXNET_OPERATOR_NN_CUDNN_CUDNN_CONVOLUTION_INL_H_ +#define MXNET_OPERATOR_NN_CUDNN_CUDNN_CONVOLUTION_INL_H_ #include #include #include #include -#include "./convolution-inl.h" +#include "../convolution-inl.h" #include "./cudnn_algoreg-inl.h" -#include "../common/cuda_utils.h" +#include "../../../common/cuda_utils.h" namespace mxnet { namespace op { @@ -41,9 +41,19 @@ namespace op { * \brief The Operator used to perform convolution using cuDNN kernels. */ template -class CuDNNConvolutionOp : public Operator { +class CuDNNConvolutionOp { public: - explicit CuDNNConvolutionOp(const ConvolutionParam& param, + CuDNNConvolutionOp() { + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); + CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); + } + + void Init(const ConvolutionParam& param, int forward_compute_type, int backward_compute_type, const std::vector& in_shape, @@ -56,8 +66,6 @@ class CuDNNConvolutionOp : public Operator { auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type); // convert MB to words param_.workspace = (param_.workspace << 20) / sizeof(DType); - init_cudnn_ = false; - init_temp_size_ = false; dtype_ = DataType::kCudnnFlag; // TensorCore algos only allowed on fp16-I/O convolutions if permitted by the global policy. 
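/* Illustrative note (not from the original patch): the "convert MB to words"
   line above turns the user-facing workspace limit, given in megabytes, into a
   count of DType elements, e.g. for DType = float (4 bytes) and workspace = 1024:

     param_.workspace = (1024 << 20) / sizeof(float)  // 1024 MB in bytes
                      = 1073741824 / 4
                      = 268435456 elements

   The assignment below then gates TensorCore algorithms: they are considered
   only for fp16 I/O and only if GetEnvAllowTensorCore() permits. */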
cudnn_tensor_core_ = DataType::kFlag == kFloat16 && GetEnvAllowTensorCore(); @@ -91,22 +99,19 @@ class CuDNNConvolutionOp : public Operator { } ~CuDNNConvolutionOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); - CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); + CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); } - virtual void Forward(const OpContext &ctx, + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + const std::vector &out_data) { using namespace mshadow; size_t expected = param_.no_bias ? 2 : 3; DType *data_ptr = NULL; @@ -182,13 +187,11 @@ class CuDNNConvolutionOp : public Operator { } } - virtual void Backward(const OpContext &ctx, + void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, - const std::vector &out_data, const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; size_t expected = param_.no_bias == 0 ? 3 : 2; @@ -198,7 +201,8 @@ class CuDNNConvolutionOp : public Operator { DType *data_ptr = NULL; DType *gdata_ptr = NULL; CHECK_EQ(out_grad.size(), 1U); - CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(in_grad.size(), expected); Stream *s = ctx.get_stream(); if (param_.kernel.ndim() == 2) { Tensor grad = out_grad[conv::kOut].get(s); @@ -223,6 +227,7 @@ class CuDNNConvolutionOp : public Operator { data_ptr = data.dptr_; gdata_ptr = gdata.dptr_; } + GetTempSize(ctx); Tensor workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_); size_t workspace_size = TensorSizeBytes(workspace); for (uint32_t g = 0; g < param_.num_group; ++g) { @@ -360,13 +365,6 @@ class CuDNNConvolutionOp : public Operator { size_t expected = param_.no_bias ? 
2 : 3; CHECK_EQ(in_shape.size(), expected); CHECK_EQ(out_shape.size(), 1U); - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); - CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); TShape dshape = in_shape[conv::kData]; TShape wshape = in_shape[conv::kWeight]; @@ -572,7 +570,6 @@ class CuDNNConvolutionOp : public Operator { &bias_shape[0], &bias_stride[0])); } - init_cudnn_ = true; } void SelectAlgo(const Context& ctx, @@ -815,7 +812,6 @@ class CuDNNConvolutionOp : public Operator { } void GetTempSize(const OpContext& ctx) { - if (init_temp_size_) return; mshadow::Stream *s = ctx.get_stream(); size_t back_size = 0, back_size_w = 0; CUDNN_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_, @@ -840,8 +836,6 @@ class CuDNNConvolutionOp : public Operator { out_desc_, forward_algo_.AlgoNumber(), &forward_workspace_byte_)); - - init_temp_size_ = true; } int *CastTShapeToIntPtr(const TShape& s, std::vector *buffer) { @@ -874,8 +868,6 @@ class CuDNNConvolutionOp : public Operator { std::vector param_dilate_; std::vector param_pad_; - bool init_cudnn_; - bool init_temp_size_; // Temp workspace size in bytes needed for Forward() operation. size_t forward_workspace_byte_; // Temp workspace size in bytes needed for Backward() operation. @@ -910,4 +902,4 @@ class CuDNNConvolutionOp : public Operator { } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CUDNN_CONVOLUTION_INL_H_ +#endif // MXNET_OPERATOR_NN_CUDNN_CUDNN_CONVOLUTION_INL_H_ diff --git a/src/operator/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h similarity index 96% rename from src/operator/cudnn_deconvolution-inl.h rename to src/operator/nn/cudnn/cudnn_deconvolution-inl.h index 5e9b7c5704d0..0badd99817e5 100644 --- a/src/operator/cudnn_deconvolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h @@ -22,25 +22,35 @@ * \brief * \author Wei Wu, Leonard Lausen */ -#ifndef MXNET_OPERATOR_CUDNN_DECONVOLUTION_INL_H_ -#define MXNET_OPERATOR_CUDNN_DECONVOLUTION_INL_H_ +#ifndef MXNET_OPERATOR_NN_CUDNN_CUDNN_DECONVOLUTION_INL_H_ +#define MXNET_OPERATOR_NN_CUDNN_CUDNN_DECONVOLUTION_INL_H_ #include #include #include #include -#include "./deconvolution-inl.h" +#include "../deconvolution-inl.h" #include "./cudnn_algoreg-inl.h" -#include "../common/cuda_utils.h" +#include "../../../common/cuda_utils.h" namespace mxnet { namespace op { #if MXNET_USE_CUDNN == 1 template -class CuDNNDeconvolutionOp : public Operator { +class CuDNNDeconvolutionOp { public: - explicit CuDNNDeconvolutionOp(DeconvolutionParam param, + CuDNNDeconvolutionOp() { + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); + CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); + } + + void Init(DeconvolutionParam param, int forward_compute_type, int backward_compute_type, const std::vector& in_shape, @@ -53,8 +63,6 @@ class CuDNNDeconvolutionOp : public Operator { auto cudnn_backward_compute_type 
= convertToCuDNNDataType(backward_compute_type); // convert MB to words param_.workspace = (param_.workspace << 20) / sizeof(DType); - init_cudnn_ = false; - init_temp_size_ = false; dtype_ = mshadow::DataType::kCudnnFlag; // TensorCore algos only allowed on fp16-I/O deconvolutions if permitted by the global policy. cudnn_tensor_core_ = DataType::kFlag == kFloat16 && GetEnvAllowTensorCore(); @@ -88,22 +96,19 @@ class CuDNNDeconvolutionOp : public Operator { } ~CuDNNDeconvolutionOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); - CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); + CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); } - virtual void Forward(const OpContext &ctx, + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + const std::vector &out_data) { using namespace mshadow; size_t expected = param_.no_bias ? 2 : 3; DType *data_ptr = NULL; @@ -196,13 +201,11 @@ class CuDNNDeconvolutionOp : public Operator { } } - virtual void Backward(const OpContext &ctx, + void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, - const std::vector &out_data, const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; size_t expected = param_.no_bias == 0 ? 3 : 2; @@ -212,7 +215,8 @@ class CuDNNDeconvolutionOp : public Operator { DType *data_ptr = NULL; DType *gdata_ptr = NULL; CHECK_EQ(out_grad.size(), 1U); - CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(in_data.size(), 2U); + CHECK_EQ(in_grad.size(), expected); Stream *s = ctx.get_stream(); if (param_.kernel.ndim() == 2) { Tensor grad = out_grad[deconv::kOut].get(s); @@ -379,13 +383,6 @@ class CuDNNDeconvolutionOp : public Operator { size_t expected = param_.no_bias ? 
2 : 3; CHECK_EQ(in_shape.size(), expected); CHECK_EQ(out_shape.size(), 1U); - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); - CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); TShape dshape = in_shape[deconv::kData]; TShape wshape = in_shape[deconv::kWeight]; @@ -590,7 +587,6 @@ class CuDNNDeconvolutionOp : public Operator { &bias_shape[0], &bias_stride[0])); } - init_cudnn_ = true; } void SelectAlgo(const Context& ctx, @@ -842,7 +838,6 @@ class CuDNNDeconvolutionOp : public Operator { } void GetTempSize(const OpContext& ctx) { - if (init_temp_size_) return; mshadow::Stream *s = ctx.get_stream(); size_t back_data_algo_workspace_size = 0; size_t back_filter_algo_workspace_size = 0; @@ -872,7 +867,6 @@ class CuDNNDeconvolutionOp : public Operator { forward_workspace_byte_ = back_data_algo_workspace_size; backward_workspace_byte_ = std::max(forward_algo_workspace_size, back_filter_algo_workspace_size); - init_temp_size_ = true; } int *CastTShapeToIntPtr(const TShape& s, std::vector *buffer) { @@ -903,8 +897,11 @@ class CuDNNDeconvolutionOp : public Operator { std::vector param_stride_; std::vector param_dilate_; - bool init_cudnn_; - bool init_temp_size_; + int forward_compute_type_; + int backward_compute_type_; + const std::vector in_shapes_; + const std::vector out_shapes_; + // Temp workspace size in bytes needed for Forward() operation. Note that // in deconvolution, this is handled by the cuDNN backprop-to-data kernel. size_t forward_workspace_byte_; @@ -949,4 +946,4 @@ class CuDNNDeconvolutionOp : public Operator { } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CUDNN_DECONVOLUTION_INL_H_ +#endif // MXNET_OPERATOR_NN_CUDNN_CUDNN_DECONVOLUTION_INL_H_ diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h new file mode 100644 index 000000000000..b31e45f26683 --- /dev/null +++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h @@ -0,0 +1,285 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file cudnn_pooling-inl.h + * \brief + * \author Bing Xu +*/ + +#ifndef MXNET_OPERATOR_NN_CUDNN_CUDNN_POOLING_INL_H_ +#define MXNET_OPERATOR_NN_CUDNN_CUDNN_POOLING_INL_H_ +#include +#include +#include "../pooling-inl.h" + +namespace mxnet { +namespace op { + +template +class CuDNNPoolingOp { + public: + CuDNNPoolingOp() { + // TODO(xxx): fp16 + dtype_ = mshadow::DataType::kCudnnFlag; + CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + } + + void Init(const PoolingParam &p) { + param_ = p; + switch (param_.pool_type) { + case pool_enum::kMaxPooling: + mode_ = CUDNN_POOLING_MAX; + break; + case pool_enum::kAvgPooling: + mode_ = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + break; + default: + LOG(FATAL) << "Not implemented"; + } + } + + ~CuDNNPoolingOp() { + CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); + CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_)); + } + + void Forward(const OpContext &ctx, const TBlob &in_data, + const OpReqType &req, const TBlob &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); + typename DataType::ScaleType alpha = 1.0f; + typename DataType::ScaleType beta = 0.0f; + this->Init(s, in_data, out_data); + if (param_.kernel.ndim() == 2) { + // 2d pool + Tensor data = in_data.get(s); + Tensor out = out_data.get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_, + pooling_desc_, + &alpha, + in_desc_, + data.dptr_, + &beta, + out_desc_, + out.dptr_)); + } else if (param_.kernel.ndim() == 3) { + // 3d pool + Tensor data = in_data.get(s); + Tensor out = out_data.get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_, + pooling_desc_, + &alpha, + in_desc_, + data.dptr_, + &beta, + out_desc_, + out.dptr_)); + } else { + LOG(FATAL) << "Only supports 2D or 3D pooling"; + } + } + + void Backward(const OpContext &ctx, const TBlob &out_grad, + const TBlob &in_data, const TBlob &out_data, + const OpReqType &req, const TBlob &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + + Stream *s = ctx.get_stream(); + CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); + typename DataType::ScaleType alpha = 1.0f; + typename DataType::ScaleType beta = 0.0f; + this->Init(s, in_data, out_data); + if (param_.kernel.ndim() == 2) { + // 2d pool + Tensor m_out_grad = out_grad.get(s); + Tensor m_in_data = in_data.get(s); + Tensor m_out_data = out_data.get(s); + Tensor m_in_grad = in_grad.get(s); + CUDNN_CALL(cudnnPoolingBackward(s->dnn_handle_, + pooling_desc_, + &alpha, + out_desc_, + m_out_data.dptr_, + out_desc_, + m_out_grad.dptr_, + in_desc_, + m_in_data.dptr_, + &beta, + in_desc_, + m_in_grad.dptr_)); + } else if (param_.kernel.ndim() == 3) { + // 3d pool + Tensor m_out_grad = out_grad.get(s); + Tensor m_in_data = in_data.get(s); + Tensor m_out_data = out_data.get(s); + Tensor m_in_grad = in_grad.get(s); + CUDNN_CALL(cudnnPoolingBackward(s->dnn_handle_, + pooling_desc_, + &alpha, + out_desc_, + m_out_data.dptr_, + out_desc_, + m_out_grad.dptr_, + in_desc_, + m_in_data.dptr_, + &beta, + in_desc_, + m_in_grad.dptr_)); + } else { + LOG(FATAL) << "Only supports 2D or 3D pooling"; + } + } + 
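/* Minimal usage sketch (hypothetical caller, assuming the stateless-FCompute
   convention used elsewhere in this patch; DType stands for the elided template
   argument):

     static thread_local CuDNNPoolingOp<DType> op;    // one instance per thread/type
     op.Init(param);                                  // select max/avg pooling mode
     op.Forward(ctx, inputs[0], req[0], outputs[0]);  // descriptors re-set per call

   Because the constructor creates the descriptors once and Init()/Forward()
   re-configure them from the current shapes, a single cached object can serve
   inputs whose shapes change between calls. */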
private: + inline void Init(mshadow::Stream *s, const TBlob &in_data, + const TBlob &out_data) { + using namespace mshadow; + #if CUDNN_MAJOR >= 5 + nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; + #endif + if (param_.kernel.ndim() == 2) { + // 2d conv + Tensor data = in_data.get(s); + Tensor out = out_data.get(s); + mshadow::Shape<4> dshape = data.shape_; + CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3])); + CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + out.shape_[0], + out.shape_[1], + out.shape_[2], + out.shape_[3])); + #if CUDNN_MAJOR >= 5 + CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, + mode_, + nan_prop_, + param_.global_pool ? dshape[2] : param_.kernel[0], + param_.global_pool ? dshape[3] : param_.kernel[1], + param_.pad[0], + param_.pad[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 1 :param_.stride[1])); + #else + CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, + mode_, + param_.global_pool ? dshape[2] : param_.kernel[0], + param_.global_pool ? dshape[3] : param_.kernel[1], + param_.pad[0], + param_.pad[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 1 : param_.stride[1])); + #endif + } else { + Tensor data = in_data.get(s); + Tensor out = out_data.get(s); + std::vector ishape = {static_cast(data.shape_[0]), + static_cast(data.shape_[1]), + static_cast(data.shape_[2]), + static_cast(data.shape_[3]), + static_cast(data.shape_[4])}; + + std::vector istride = {static_cast(ishape[1] * ishape[2] * ishape[3] * ishape[4]), + static_cast(ishape[2] * ishape[3] * ishape[4]), + static_cast(ishape[3] * ishape[4]), + static_cast(ishape[4]), 1}; + + std::vector oshape = {static_cast(out.shape_[0]), + static_cast(out.shape_[1]), + static_cast(out.shape_[2]), + static_cast(out.shape_[3]), + static_cast(out.shape_[4])}; + + std::vector ostride = {static_cast(oshape[1] * oshape[2] * oshape[3] * oshape[4]), + static_cast(oshape[2] * oshape[3] * oshape[4]), + static_cast(oshape[3] * oshape[4]), + static_cast(oshape[4]), 1}; + + std::vector kernel_vec = {param_.global_pool ? ishape[2] : + static_cast(param_.kernel[0]), + param_.global_pool ? ishape[3] : + static_cast(param_.kernel[1]), + param_.global_pool ? ishape[4] : + static_cast(param_.kernel[2])}; + + std::vector pad_vec = {param_.global_pool ? 0 : static_cast(param_.pad[0]), + param_.global_pool ? 0 : static_cast(param_.pad[1]), + param_.global_pool ? 0 : static_cast(param_.pad[2])}; + + std::vector stride_vec = {param_.global_pool ? 1 : static_cast(param_.stride[0]), + param_.global_pool ? 1 : static_cast(param_.stride[1]), + param_.global_pool ? 
1 : static_cast(param_.stride[2])}; + + CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_, + dtype_, + static_cast(ishape.size()), + &ishape[0], + &istride[0])); + CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_, + dtype_, + static_cast(oshape.size()), + &oshape[0], + &ostride[0])); + #if CUDNN_MAJOR >= 5 + CUDNN_CALL(cudnnSetPoolingNdDescriptor(pooling_desc_, + mode_, + nan_prop_, + static_cast(kernel_vec.size()), + &(kernel_vec[0]), + &(pad_vec[0]), + &(stride_vec[0]))); + #else + LOG(FATAL) << "3D pooling only supports CUDNN v5 and above"; + #endif + } + } + + cudnnDataType_t dtype_; + cudnnHandle_t handle_; + cudnnPoolingMode_t mode_; + cudnnTensorDescriptor_t in_desc_; + cudnnTensorDescriptor_t out_desc_; + cudnnPoolingDescriptor_t pooling_desc_; + #if CUDNN_MAJOR >= 5 + cudnnNanPropagation_t nan_prop_; + #endif + PoolingParam param_; +}; // class CuDNNPoolingOp +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NN_CUDNN_CUDNN_POOLING_INL_H_ + diff --git a/src/operator/cudnn_softmax_activation-inl.h b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h similarity index 50% rename from src/operator/cudnn_softmax_activation-inl.h rename to src/operator/nn/cudnn/cudnn_softmax_activation-inl.h index c604a8f3f4c1..9dac3bcebbbd 100644 --- a/src/operator/cudnn_softmax_activation-inl.h +++ b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h @@ -23,81 +23,72 @@ * \author Bing Xu */ -#ifndef MXNET_OPERATOR_CUDNN_SOFTMAX_ACTIVATION_INL_H_ -#define MXNET_OPERATOR_CUDNN_SOFTMAX_ACTIVATION_INL_H_ +#ifndef MXNET_OPERATOR_NN_CUDNN_CUDNN_SOFTMAX_ACTIVATION_INL_H_ +#define MXNET_OPERATOR_NN_CUDNN_CUDNN_SOFTMAX_ACTIVATION_INL_H_ #include #include -#include "./softmax_activation-inl.h" +#include "../softmax_activation-inl.h" namespace mxnet { namespace op { -class CuDNNSoftmaxActivationOp : public Operator { +class CuDNNSoftmaxActivationOp { public: - explicit CuDNNSoftmaxActivationOp(SoftmaxActivationParam param) { - this->param_ = param; - init_cudnn_ = false; + CuDNNSoftmaxActivationOp() { dtype_ = CUDNN_DATA_FLOAT; + CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); + } + + void Init(SoftmaxActivationParam param) { + this->param_ = param; } ~CuDNNSoftmaxActivationOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_)); - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_)); } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, const TBlob &in_data, + const OpReqType &req, const TBlob &out_data) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); Tensor data; Tensor out; cudnnSoftmaxMode_t softmax_mode; if (param_.mode == softmax_activation::kInstance) { - CHECK_EQ(in_data[softmax_activation::kData].ndim(), 2) + CHECK_EQ(in_data.ndim(), 2) << "Input need to have 2 dimensions when mode=instance."; - Shape<4> dshape = Shape4(in_data[softmax_activation::kData].shape_[0], - in_data[softmax_activation::kData].shape_[1], 1, 1); - data = in_data[softmax_activation::kData].get_with_shape(dshape, s); - out = out_data[softmax_activation::kOut].get_with_shape(dshape, s); + Shape<4> dshape = Shape4(in_data.shape_[0], in_data.shape_[1], 1, 1); + data = in_data.get_with_shape(dshape, s); + out = out_data.get_with_shape(dshape, s); softmax_mode = CUDNN_SOFTMAX_MODE_INSTANCE; } else { - 
CHECK_GE(in_data[softmax_activation::kData].ndim(), 3) + CHECK_GE(in_data.ndim(), 3) << "Input need to have a least 3 dimensions when mode=channel"; Shape<4> dshape; - index_t size_left = in_data[softmax_activation::kData].Size(); + index_t size_left = in_data.Size(); for (int i = 0; i < 3; ++i) { - if (i < in_data[softmax_activation::kData].ndim()) { - dshape[i] = in_data[softmax_activation::kData].shape_[i]; + if (i < in_data.ndim()) { + dshape[i] = in_data.shape_[i]; } else { dshape[i] = 1; } size_left /= dshape[i]; } dshape[3] = size_left; - data = in_data[softmax_activation::kData].get_with_shape(dshape, s); - out = out_data[softmax_activation::kOut].get_with_shape(dshape, s); + data = in_data.get_with_shape(dshape, s); + out = out_data.get_with_shape(dshape, s); softmax_mode = CUDNN_SOFTMAX_MODE_CHANNEL; } float alpha = 1.0f; float beta = 0.0f; CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); - if (!init_cudnn_) { - init_cudnn_ = true; - CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); - CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - data.shape_[0], - data.shape_[1], - data.shape_[2], - data.shape_[3])); - } + CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3])); CUDNN_CALL(cudnnSoftmaxForward(s->dnn_handle_, CUDNN_SOFTMAX_ACCURATE, softmax_mode, @@ -109,19 +100,10 @@ class CuDNNSoftmaxActivationOp : public Operator { out.dptr_)); } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + void Backward(const OpContext &ctx, const TBlob &out_grad, + const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - CHECK_EQ(req.size(), 1U); - CHECK_EQ(in_grad.size(), 1U); float alpha = 1.0f; float beta = 0.0f; Stream *s = ctx.get_stream(); @@ -131,31 +113,30 @@ class CuDNNSoftmaxActivationOp : public Operator { Tensor input_grad; cudnnSoftmaxMode_t softmax_mode; if (param_.mode == softmax_activation::kInstance) { - CHECK_EQ(in_grad[softmax_activation::kData].ndim(), 2) + CHECK_EQ(in_grad.ndim(), 2) << "Input need to have 2 dimensions when mode=instance."; - Shape<4> dshape = Shape4(in_grad[softmax_activation::kData].shape_[0], - in_grad[softmax_activation::kData].shape_[1], 1, 1); - grad = out_grad[softmax_activation::kOut].get_with_shape(dshape, s); - output_data = out_data[softmax_activation::kOut].get_with_shape(dshape, s); - input_grad = in_grad[softmax_activation::kData].get_with_shape(dshape, s); + Shape<4> dshape = Shape4(in_grad.shape_[0], in_grad.shape_[1], 1, 1); + grad = out_grad.get_with_shape(dshape, s); + output_data = out_data.get_with_shape(dshape, s); + input_grad = in_grad.get_with_shape(dshape, s); softmax_mode = CUDNN_SOFTMAX_MODE_INSTANCE; } else { - CHECK_GE(in_grad[softmax_activation::kData].ndim(), 3) + CHECK_GE(in_grad.ndim(), 3) << "Input need to have a least 3 dimensions when mode=channel"; Shape<4> dshape; - index_t size_left = in_grad[softmax_activation::kData].Size(); + index_t size_left = in_grad.Size(); for (int i = 0; i < 3; ++i) { - if (i < in_grad[softmax_activation::kData].ndim()) { - dshape[i] = in_grad[softmax_activation::kData].shape_[i]; + if (i < in_grad.ndim()) { + dshape[i] = 
in_grad.shape_[i]; } else { dshape[i] = 1; } size_left /= dshape[i]; } dshape[3] = size_left; - output_data = out_data[softmax_activation::kOut].get_with_shape(dshape, s); - grad = out_grad[softmax_activation::kOut].get_with_shape(dshape, s); - input_grad = in_grad[softmax_activation::kData].get_with_shape(dshape, s); + output_data = out_data.get_with_shape(dshape, s); + grad = out_grad.get_with_shape(dshape, s); + input_grad = in_grad.get_with_shape(dshape, s); softmax_mode = CUDNN_SOFTMAX_MODE_CHANNEL; } CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); @@ -173,11 +154,10 @@ class CuDNNSoftmaxActivationOp : public Operator { } private: - bool init_cudnn_; cudnnDataType_t dtype_; cudnnTensorDescriptor_t shape_desc_; SoftmaxActivationParam param_; }; // class CuDNNSoftmaxActivationOp } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CUDNN_SOFTMAX_ACTIVATION_INL_H_ +#endif // MXNET_OPERATOR_NN_CUDNN_CUDNN_SOFTMAX_ACTIVATION_INL_H_ diff --git a/src/operator/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h similarity index 60% rename from src/operator/deconvolution-inl.h rename to src/operator/nn/deconvolution-inl.h index 41fcf9bfa77b..4f6b0664644c 100644 --- a/src/operator/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -22,8 +22,8 @@ * \brief * \author Wei Wu */ -#ifndef MXNET_OPERATOR_DECONVOLUTION_INL_H_ -#define MXNET_OPERATOR_DECONVOLUTION_INL_H_ +#ifndef MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_ +#define MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_ #include #include @@ -33,8 +33,8 @@ #include #include #include -#include "./operator_common.h" -#include "./linalg.h" +#include "../operator_common.h" +#include "../linalg.h" namespace mxnet { @@ -192,19 +192,18 @@ namespace mxnet { namespace op { template -class DeconvolutionOp : public Operator { +class DeconvolutionOp { public: - explicit DeconvolutionOp(DeconvolutionParam p) { + void Init(DeconvolutionParam p) { this->param_ = p; // convert MBytes first to Bytes and then to elements. param_.workspace = (param_.workspace << 20) / sizeof(real_t); } - virtual void Forward(const OpContext &ctx, + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; @@ -308,19 +307,18 @@ class DeconvolutionOp : public Operator { } } - virtual void Backward(const OpContext &ctx, + void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, - const std::vector &out_data, const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; // TODO(bing): check the BLAS Handle, be careful CHECK_EQ(out_grad.size(), 1U); size_t expected = param_.no_bias == 0 ? 
3 : 2; - CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(in_data.size(), 2U); + CHECK_EQ(in_grad.size(), expected); CHECK_EQ(req.size(), expected); CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true); // get data @@ -453,300 +451,52 @@ class DeconvolutionOp : public Operator { }; // class DeconvolutionOp template -Operator* CreateOp(DeconvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx); - -#if DMLC_USE_CXX11 -class DeconvolutionProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (!param_.no_bias) { - return {"data", "weight", "bias"}; - } else { - return {"data", "weight"}; - } - } - - void Init(const std::vector >& kwargs) override { - using namespace mshadow; - param_.Init(kwargs); - if (param_.kernel.ndim() == 1) { - param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW; - if (param_.stride.ndim() == 0) param_.stride = Shape1(1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1); - if (param_.pad.ndim() == 0) param_.pad = Shape1(0); - if (param_.adj.ndim() == 0) param_.adj = Shape1(0); - } else if (param_.kernel.ndim() == 2) { - param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW; - if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); - if (param_.adj.ndim() == 0) param_.adj = Shape2(0, 0); - } else { - CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D deconvolution not supported"; - param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW; - if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); - if (param_.adj.ndim() == 0) param_.adj = Shape3(0, 0, 0); - } - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { -#if MXNET_USE_CUDNN == 0 - if (param_.kernel.ndim() != 2) { - LOG(FATAL) << "If not using CUDNN only 2D-Deconvolution is supported"; - return false; - } -#endif // CUDNN - - using namespace mshadow; - if (!param_.no_bias) { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; - } else { - CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; - } - out_shape->resize(1, TShape()); - const TShape &dshape = (*in_shape)[deconv::kData]; - if (dshape.ndim() == 0) return false; - - if (param_.kernel.ndim() == 1) { - // 1d conv - CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; - Shape<3> dshape_ncw = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW); - Shape<3> wshape = Shape3(dshape_ncw[1], param_.num_filter / param_.num_group, - param_.kernel[0]); - wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); - } - - const index_t dilated_ksize_x = param_.DilatedKernelSize(0); - - index_t o_pad[1]; - index_t o_adj[1]; - param_.InferPad(dshape_ncw, o_pad, o_adj); - - CHECK_EQ(dshape_ncw[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - 
CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - - CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(x) must be samller than stride[0]"; - - Shape<3> oshape; - oshape[0] = dshape_ncw[0]; - oshape[1] = param_.num_filter; - oshape[2] = param_.stride[0] * (dshape_ncw[2] - 1) + - dilated_ksize_x - 2 * o_pad[0] + o_adj[0]; - - if (param_.target_shape[0] > 0) { - CHECK_EQ(param_.target_shape[0], oshape[2]) \ - << "param_.target_shape[0] was not reasonable, please set it carefully"; - } - - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); - - return true; - } else if (param_.kernel.ndim() == 2) { - // 2d conv - CHECK_EQ(dshape.ndim(), 4U) \ - << "Input data should be 4D in batch-num_filter-y-x"; - Shape<4> dshape_nchw = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW); - Shape<4> wshape = Shape4(dshape_nchw[1], - param_.num_filter / param_.num_group, - param_.kernel[0], param_.kernel[1]); - wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); - } - - const index_t dilated_ksize_y = param_.DilatedKernelSize(0); - const index_t dilated_ksize_x = param_.DilatedKernelSize(1); - - index_t o_pad[2]; - index_t o_adj[2]; - param_.InferPad(dshape_nchw, o_pad, o_adj); - - CHECK_EQ(dshape_nchw[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - - CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(y) must be samller than stride[0]"; - CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(x) must be samller than stride[1]"; - - Shape<4> oshape; - oshape[0] = dshape_nchw[0]; - oshape[1] = param_.num_filter; - oshape[2] = param_.stride[0] * (dshape_nchw[2] - 1) + - dilated_ksize_y - 2 * o_pad[0] + o_adj[0]; - oshape[3] = param_.stride[1] * (dshape_nchw[3] - 1) + - dilated_ksize_x - 2 * o_pad[1] + o_adj[1]; - - if (param_.target_shape[0] > 0) { - CHECK_EQ(param_.target_shape[0], oshape[2]) \ - << "param_.target_shape[0] was not reasonable, please set it carefully"; - } - if (param_.target_shape[1] > 0) { - CHECK_EQ(param_.target_shape[1], oshape[3]) \ - << "param_.target_shape[1] was not reasonable, please set it carefully"; - } - - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); - - return true; - } else if (param_.kernel.ndim() == 3) { - // 3d conv - CHECK_EQ(dshape.ndim(), 5U) \ - << "Input data should be 5D in batch-num_filter-depth-y-x"; - Shape<5> dshape_ncdhw = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW); - Shape<5> wshape = Shape5(dshape_ncdhw[1], param_.num_filter / param_.num_group, - param_.kernel[0], param_.kernel[1], param_.kernel[2]); - wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value()); - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, 
Shape1(param_.num_filter)); - } - - // Note: 3D dilation currently not supported. - // Calculations below done to preserve symmetry with 1D/2D code. - const index_t dilated_ksize_d = param_.DilatedKernelSize(0); - const index_t dilated_ksize_y = param_.DilatedKernelSize(1); - const index_t dilated_ksize_x = param_.DilatedKernelSize(2); - - index_t o_pad[3]; - index_t o_adj[3]; - param_.InferPad(dshape_ncdhw, o_pad, o_adj); - - CHECK_EQ(dshape_ncdhw[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - CHECK_EQ(param_.dilate.Size(), 1U) - << "Dilate is not supported in 3d deconvolution"; - - CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(d) must be samller than stride[0]"; - CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(y) must be samller than stride[1]"; - CHECK_GE(param_.stride[2]-1, o_adj[2]) << "adj(x) must be samller than stride[2]"; - - Shape<5> oshape; - oshape[0] = dshape_ncdhw[0]; - oshape[1] = param_.num_filter; - oshape[2] = param_.stride[0] * (dshape_ncdhw[2] - 1) + - dilated_ksize_d - 2 * o_pad[0] + o_adj[0]; - oshape[3] = param_.stride[1] * (dshape_ncdhw[3] - 1) + - dilated_ksize_y - 2 * o_pad[1] + o_adj[1]; - oshape[4] = param_.stride[2] * (dshape_ncdhw[4] - 1) + - dilated_ksize_x - 2 * o_pad[2] + o_adj[2]; - - if (param_.target_shape[0] > 0) { - CHECK_EQ(param_.target_shape[0], oshape[2]) \ - << "param_.target_shape[0] was not reasonable, please it carefully"; - } - if (param_.target_shape[1] > 0) { - CHECK_EQ(param_.target_shape[1], oshape[3]) \ - << "param_.target_shape[1] was not reasonable, please set it carefully"; - } - if (param_.target_shape[2] > 0) { - CHECK_EQ(param_.target_shape[2], oshape[4]) \ - << "param_.target_shape[2] was not reasonable, please set it carefully"; - } - - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); - - return true; - } else { - LOG(FATAL) << "Unknown convolution type"; - return false; - } - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } - } - out_type->clear(); - out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new DeconvolutionProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "Deconvolution"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[deconv::kOut], in_data[deconv::kData], in_data[deconv::kWeight]}; - } +void _DeconvolutionCompute(const DeconvolutionParam& param, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { + static thread_local DeconvolutionOp op; + op.Init(param); + 
op.Forward(ctx, inputs, req, outputs); + }); +} - std::vector ForwardResource( - const std::vector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } +template +void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + _DeconvolutionCompute(param, ctx, inputs, req, outputs); +} - std::vector BackwardResource( - const std::vector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } +template +void _DeconvolutionGradCompute(const DeconvolutionParam& param, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + std::vector in_data(inputs.begin() + 1, inputs.end()); + const TBlob &out_grad = inputs[0]; + const std::vector &in_grad = outputs; + + MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, { + static thread_local DeconvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + }); +} - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented"; - return NULL; - } - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +template +void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + _DeconvolutionGradCompute(param, ctx, inputs, req, outputs); +} - private: - DeconvolutionParam param_; -}; // class DeconvolutionProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_DECONVOLUTION_INL_H_ +#endif // MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_ diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc new file mode 100644 index 000000000000..f336be779c1c --- /dev/null +++ b/src/operator/nn/deconvolution.cc @@ -0,0 +1,429 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file deconvolution.cc + * \brief + * \author Wei Wu +*/ + +#include "./deconvolution-inl.h" +#include "./mkldnn/mkldnn_ops-inl.h" + +namespace mxnet { +namespace op { + +static bool DeconvolutionShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, std::vector *out_shape) { + const DeconvolutionParam& param_ = nnvm::get(attrs.parsed); +#if MXNET_USE_CUDNN == 0 + if (param_.kernel.ndim() != 2) { + LOG(FATAL) << "If not using CUDNN only 2D-Deconvolution is supported"; + return false; + } +#endif // CUDNN + + using namespace mshadow; + if (!param_.no_bias) { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + } + out_shape->resize(1, TShape()); + const TShape &dshape = (*in_shape)[deconv::kData]; + if (dshape.ndim() == 0) return false; + + if (param_.kernel.ndim() == 1) { + // 1d conv + CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; + Shape<3> dshape_ncw = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW); + Shape<3> wshape = Shape3(dshape_ncw[1], param_.num_filter / param_.num_group, + param_.kernel[0]); + wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); + } + + const index_t dilated_ksize_x = param_.DilatedKernelSize(0); + + index_t o_pad[1]; + index_t o_adj[1]; + param_.InferPad(dshape_ncw, o_pad, o_adj); + + CHECK_EQ(dshape_ncw[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + + CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(x) must be smaller than stride[0]"; + + Shape<3> oshape; + oshape[0] = dshape_ncw[0]; + oshape[1] = param_.num_filter; + oshape[2] = param_.stride[0] * (dshape_ncw[2] - 1) + + dilated_ksize_x - 2 * o_pad[0] + o_adj[0]; + + if (param_.target_shape.ndim() > 0) { + if (param_.target_shape[0] > 0) { + CHECK_EQ(param_.target_shape[0], oshape[2]) \ + << "param_.target_shape[0] was not reasonable, please set it carefully"; + } + } + + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); + + return true; + } else if (param_.kernel.ndim() == 2) { + // 2d conv + CHECK_EQ(dshape.ndim(), 4U) \ + << "Input data should be 4D in batch-num_filter-y-x"; + Shape<4> dshape_nchw = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW); + Shape<4> wshape = Shape4(dshape_nchw[1], + param_.num_filter / param_.num_group, + param_.kernel[0], param_.kernel[1]); + wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); + } + + const index_t dilated_ksize_y = param_.DilatedKernelSize(0); + const index_t dilated_ksize_x = param_.DilatedKernelSize(1); + + index_t o_pad[2]; + index_t o_adj[2]; + param_.InferPad(dshape_nchw, o_pad, o_adj); + + CHECK_EQ(dshape_nchw[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + <<
"output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + + CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(y) must be smaller than stride[0]"; + CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(x) must be smaller than stride[1]"; + + Shape<4> oshape; + oshape[0] = dshape_nchw[0]; + oshape[1] = param_.num_filter; + oshape[2] = param_.stride[0] * (dshape_nchw[2] - 1) + + dilated_ksize_y - 2 * o_pad[0] + o_adj[0]; + oshape[3] = param_.stride[1] * (dshape_nchw[3] - 1) + + dilated_ksize_x - 2 * o_pad[1] + o_adj[1]; + + if (param_.target_shape.ndim() > 1) { + if (param_.target_shape[0] > 0) { + CHECK_EQ(param_.target_shape[0], oshape[2]) \ + << "param_.target_shape[0] was not reasonable, please set it carefully"; + } + if (param_.target_shape[1] > 0) { + CHECK_EQ(param_.target_shape[1], oshape[3]) \ + << "param_.target_shape[1] was not reasonable, please set it carefully"; + } + } + + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); + + return true; + } else if (param_.kernel.ndim() == 3) { + // 3d conv + CHECK_EQ(dshape.ndim(), 5U) \ + << "Input data should be 5D in batch-num_filter-depth-y-x"; + Shape<5> dshape_ncdhw = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW); + Shape<5> wshape = Shape5(dshape_ncdhw[1], param_.num_filter / param_.num_group, + param_.kernel[0], param_.kernel[1], param_.kernel[2]); + wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value()); + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); + } + + // Note: 3D dilation currently not supported. + // Calculations below are done to preserve symmetry with 1D/2D code.
+ const index_t dilated_ksize_d = param_.DilatedKernelSize(0); + const index_t dilated_ksize_y = param_.DilatedKernelSize(1); + const index_t dilated_ksize_x = param_.DilatedKernelSize(2); + + index_t o_pad[3]; + index_t o_adj[3]; + param_.InferPad(dshape_ncdhw, o_pad, o_adj); + + CHECK_EQ(dshape_ncdhw[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + CHECK_EQ(param_.dilate.Size(), 1U) + << "Dilate is not supported in 3d deconvolution"; + + CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(d) must be smaller than stride[0]"; + CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(y) must be smaller than stride[1]"; + CHECK_GE(param_.stride[2]-1, o_adj[2]) << "adj(x) must be smaller than stride[2]"; + + Shape<5> oshape; + oshape[0] = dshape_ncdhw[0]; + oshape[1] = param_.num_filter; + oshape[2] = param_.stride[0] * (dshape_ncdhw[2] - 1) + + dilated_ksize_d - 2 * o_pad[0] + o_adj[0]; + oshape[3] = param_.stride[1] * (dshape_ncdhw[3] - 1) + + dilated_ksize_y - 2 * o_pad[1] + o_adj[1]; + oshape[4] = param_.stride[2] * (dshape_ncdhw[4] - 1) + + dilated_ksize_x - 2 * o_pad[2] + o_adj[2]; + + if (param_.target_shape.ndim() > 2) { + if (param_.target_shape[0] > 0) { + CHECK_EQ(param_.target_shape[0], oshape[2]) \ + << "param_.target_shape[0] was not reasonable, please set it carefully"; + } + if (param_.target_shape[1] > 0) { + CHECK_EQ(param_.target_shape[1], oshape[3]) \ + << "param_.target_shape[1] was not reasonable, please set it carefully"; + } + if (param_.target_shape[2] > 0) { + CHECK_EQ(param_.target_shape[2], oshape[4]) \ + << "param_.target_shape[2] was not reasonable, please set it carefully"; + } + } + + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); + + return true; + } else { + LOG(FATAL) << "Unknown convolution type"; + return false; + } +} + +static inline std::vector ListArguments(const DeconvolutionParam& param_) { + if (!param_.no_bias) { + return {"data", "weight", "bias"}; + } else { + return {"data", "weight"}; + } +} + +static bool DeconvolutionType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + const DeconvolutionParam& param_ = nnvm::get(attrs.parsed); + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]); + } + } + out_type->clear(); + out_type->push_back(dtype); + return true; +} + +inline static bool DeconvStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ?
2 : 3; + CHECK_EQ(in_attrs->size(), in_expected); + CHECK_EQ(out_attrs->size(), 1); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + (*out_attrs)[0] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + (*out_attrs)[0] = kDefaultStorage; + return true; +} + +inline static bool backward_DeconvStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 3 : 4; + uint32_t out_expected = param.no_bias ? 2 : 3; + CHECK_EQ(in_attrs->size(), in_expected); + CHECK_EQ(out_attrs->size(), out_expected); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kDefaultStorage; + return true; +} + +static void DeconvolutionCompute_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, const std::vector& outputs) { +#if MXNET_USE_MKLDNN == 1 + switch (inputs[0].dtype()) { + case mshadow::kFloat32: + MKLDNNDeconvolution_Forward(attrs, ctx, inputs, req, outputs); + return; + } +#endif + // TODO: convert the input NDArrays to the default format before this fallback. + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + DeconvolutionCompute(attrs, ctx, in_blobs, req, out_blobs); +} + +static void DeconvolutionGradCompute_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, const std::vector& outputs) { +#if MXNET_USE_MKLDNN == 1 + switch (inputs[0].dtype()) { + case mshadow::kFloat32: + MKLDNNDeconvolution_Backward(attrs, ctx, inputs, req, outputs); + return; + } +#endif + // TODO: convert the input NDArrays to the default format before this fallback. + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + DeconvolutionGradCompute(attrs, ctx, in_blobs, req, out_blobs); +} + +static void DeconvolutionParamParser(nnvm::NodeAttrs* attrs) { + using namespace mshadow; + DeconvolutionParam param_; + param_.Init(attrs->dict); + if (param_.kernel.ndim() == 1) { + param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW; + if (param_.stride.ndim() == 0) param_.stride = Shape1(1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1); + if (param_.pad.ndim() == 0) param_.pad = Shape1(0); + if (param_.adj.ndim() == 0) param_.adj = Shape1(0); + } else if (param_.kernel.ndim() == 2) { + param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW; + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + if (param_.adj.ndim() == 0) param_.adj = Shape2(0, 0); + } else { + CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D deconvolution not supported"; + param_.layout = param_.layout ?
param_.layout.value(): mshadow::kNCDHW; + if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); + if (param_.adj.ndim() == 0) param_.adj = Shape3(0, 0, 0); + } + attrs->parsed = std::move(param_); +} + +struct DeconvolutionGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + std::vector heads(ograds.begin(), ograds.end()); + heads.push_back(n->inputs[deconv::kData]); + heads.push_back(n->inputs[deconv::kWeight]); + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + +DMLC_REGISTER_PARAMETER(DeconvolutionParam); + +NNVM_REGISTER_OP(Deconvolution) +.describe("Computes 2D transposed convolution (aka fractionally strided convolution) of the " + "input tensor. This operation can be seen as the gradient of the Convolution operation with " + "respect to its input. Convolution usually reduces the size of the input. Transposed " + "convolution works the other way, going from a smaller input to a larger output while " + "preserving the connectivity pattern.") +.set_num_inputs([](const NodeAttrs& attrs) { + const DeconvolutionParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 2 : 3; +}) +.set_num_outputs(1) +.set_attr_parser(DeconvolutionParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return ListArguments(nnvm::get(attrs.parsed)); +}) +.set_attr("FInferShape", DeconvolutionShape) +.set_attr("FInferType", DeconvolutionType) +.set_attr("FInferStorageType", DeconvStorageType) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr("FCompute", DeconvolutionCompute) +.set_attr("FComputeEx", DeconvolutionCompute_CPU) +.set_attr("FGradient", DeconvolutionGrad{"_backward_Deconvolution"}) +.add_argument("data", "NDArray-or-Symbol", "Input tensor to the deconvolution operation.") +.add_argument("weight", "NDArray-or-Symbol", "Weights representing the kernel.") +.add_argument("bias", "NDArray-or-Symbol", "Bias added to the result after the deconvolution " + "operation.") +.add_arguments(DeconvolutionParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_Deconvolution) +.set_num_outputs([](const NodeAttrs& attrs) { + const DeconvolutionParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 2 : 3; +}) +.set_attr("TIsBackward", true) +.set_attr("FInferStorageType", backward_DeconvStorageType) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr_parser(DeconvolutionParamParser) +.set_attr("FCompute", DeconvolutionGradCompute) +.set_attr("FComputeEx", DeconvolutionGradCompute_CPU); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu new file mode 100644 index 000000000000..5a59fae3d0b4 --- /dev/null +++ b/src/operator/nn/deconvolution.cu @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file deconvolution.cu + * \brief + * \author Wei Wu +*/ + +#include "./deconvolution-inl.h" +#if MXNET_USE_CUDNN == 1 +#include "./cudnn/cudnn_deconvolution-inl.h" +#endif // MXNET_USE_CUDNN + +namespace mxnet { +namespace op { + +template +static DeconvolutionOp &get_op(const DeconvolutionParam& param) { + static thread_local DeconvolutionOp op; + op.Init(param); + return op; +} + +template +static CuDNNDeconvolutionOp &get_cudnn_op(const DeconvolutionParam& param, + int forward_compute_type, int backward_compute_type, + const std::vector& in_shape, const std::vector& out_shape, + const Context& ctx, bool backward) { + // Convolution forward has to be called before backward for this operator. + // So we can't make this operator thread local. backward might be called + // in another thread. + static CuDNNDeconvolutionOp op; + if (!backward) + op.Init(param, forward_compute_type, backward_compute_type, in_shape, out_shape, ctx); + return op; +} + +template<> +void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + int dtype = inputs[0].type_flag_; + // If 1D deconvolution, use MXNet implementation + if (param.kernel.ndim() == 1) { + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + get_op(param).Forward(ctx, inputs, req, outputs); + }) + return; + } +#if MXNET_USE_CUDNN == 1 + // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16). + int compute_type = (dtype == mshadow::kFloat16) ? 
mshadow::kFloat32 : dtype; + + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + if (param.cudnn_off) { + get_op(param).Forward(ctx, inputs, req, outputs); + } else if (!CuDNNDeconvolutionOp::Supports(param, + compute_type, compute_type, ctx.run_ctx.ctx)) { + LOG(WARNING) << + "This deconvolution is not supported by cuDNN; falling back to the MXNet implementation."; + get_op(param).Forward(ctx, inputs, req, outputs); + } else { + std::vector in_shape(inputs.size()); + std::vector out_shape(1, outputs[0].shape_); + for (size_t i = 0; i < in_shape.size(); i++) { + in_shape[i] = inputs[i].shape_; + } + get_cudnn_op(param, compute_type, compute_type, + in_shape, out_shape, ctx.run_ctx.ctx, false).Forward(ctx, inputs, req, outputs); + } + }) +#else + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + get_op(param).Forward(ctx, inputs, req, outputs); + }) +#endif // MXNET_USE_CUDNN +} + +template<> +void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + std::vector in_data(inputs.begin() + 1, inputs.end()); + const TBlob &out_grad = inputs[0]; + const std::vector &in_grad = outputs; + int dtype = out_grad.type_flag_; + + // If 1D deconvolution, use MXNet implementation + if (param.kernel.ndim() == 1) { + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + get_op(param).Backward(ctx, std::vector{out_grad}, + in_data, req, in_grad); + }) + return; + } +#if MXNET_USE_CUDNN == 1 + // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16). + int compute_type = (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype; + + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + if (param.cudnn_off) { + get_op(param).Backward(ctx, std::vector{out_grad}, + in_data, req, in_grad); + } else if (!CuDNNDeconvolutionOp::Supports(param, + compute_type, compute_type, ctx.run_ctx.ctx)) { + LOG(WARNING) << + "This deconvolution is not supported by cuDNN; falling back to the MXNet implementation."; + get_op(param).Backward(ctx, std::vector{out_grad}, + in_data, req, in_grad); + } else { + std::vector in_shape(in_data.size()); + std::vector out_shape(1, out_grad.shape_); + for (size_t i = 0; i < in_shape.size(); i++) { + in_shape[i] = in_data[i].shape_; + } + get_cudnn_op(param, compute_type, compute_type, + in_shape, out_shape, ctx.run_ctx.ctx, true).Backward(ctx, + std::vector{out_grad}, in_data, req, in_grad); + } + }) +#else + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + get_op(param).Backward(ctx, std::vector{out_grad}, + in_data, req, in_grad); + }) +#endif // MXNET_USE_CUDNN +} + +NNVM_REGISTER_OP(Deconvolution) +.set_attr("FCompute", DeconvolutionCompute); + +NNVM_REGISTER_OP(_backward_Deconvolution) +.set_attr("FCompute", DeconvolutionGradCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/depthwise_convolution-inl.h b/src/operator/nn/depthwise_convolution-inl.h similarity index 91% rename from src/operator/depthwise_convolution-inl.h rename to src/operator/nn/depthwise_convolution-inl.h index e43fd08a26d3..0af8cae51c84 100644 --- a/src/operator/depthwise_convolution-inl.h +++ b/src/operator/nn/depthwise_convolution-inl.h @@ -22,12 +22,12 @@ * \brief CUDA depthwise convolution code + * \author shuqian.qu@hobot.cc */ -#ifndef MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_INL_H_ -#define MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_INL_H_ +#ifndef MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_INL_H_ +#define MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_INL_H_ #include #include #include
"./convolution-inl.h" -#include "../common/cuda_utils.h" +#include "../../common/cuda_utils.h" #if MXNET_USE_CUDA #include @@ -39,11 +39,11 @@ namespace mxnet { namespace op { using namespace tf::depthwise_conv; template -class DepthwiseConvolutionOp : public Operator { +class DepthwiseConvolutionOp { public: - explicit DepthwiseConvolutionOp(const ConvolutionParam& param, - const std::vector& in_shape, - const std::vector& out_shape) { + void Init(const ConvolutionParam& param, + const std::vector& in_shape, + const std::vector& out_shape) { args_.batch = in_shape[conv::kData][0]; args_.in_channel = in_shape[conv::kData][1]; args_.in_height = in_shape[conv::kData][2]; @@ -62,19 +62,16 @@ class DepthwiseConvolutionOp : public Operator { ~DepthwiseConvolutionOp() {} - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args); + void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data); - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args); + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &req, + const std::vector &in_grad); private: DepthwiseArgs args_; @@ -282,8 +279,7 @@ template void DepthwiseConvolutionOp::Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; auto stream = ctx.get_stream(); @@ -305,10 +301,8 @@ template void DepthwiseConvolutionOp::Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, - const std::vector &out_data, const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; auto stream = ctx.get_stream(); @@ -350,4 +344,4 @@ void DepthwiseConvolutionOp::Backward(const OpContext &ctx, } // namespace mxnet #endif -#endif // MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_INL_H_ +#endif // MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_INL_H_ diff --git a/src/operator/depthwise_convolution_tf.cuh b/src/operator/nn/depthwise_convolution_tf.cuh similarity index 99% rename from src/operator/depthwise_convolution_tf.cuh rename to src/operator/nn/depthwise_convolution_tf.cuh index f94da4462297..e4dfd8292d2d 100644 --- a/src/operator/depthwise_convolution_tf.cuh +++ b/src/operator/nn/depthwise_convolution_tf.cuh @@ -24,10 +24,10 @@ * are different with origin version. 
* \author shuqian.qu@hobot.cc */ -#ifndef MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_ -#define MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_ -#include "../common/cuda_utils.h" -#include "./mxnet_op.h" +#ifndef MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_ +#define MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_ +#include "../../common/cuda_utils.h" +#include "../mxnet_op.h" namespace tf { namespace depthwise_conv { @@ -730,4 +730,4 @@ bool TryLaunchDepthwiseConv2dBackwardFilterGPUSmall(mshadow::Stream } // namespace depthwise_conv } // namespace tf -#endif // MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_ +#endif // MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_ diff --git a/src/operator/dropout-inl.h b/src/operator/nn/dropout-inl.h similarity index 52% rename from src/operator/dropout-inl.h rename to src/operator/nn/dropout-inl.h index b2fb7823bedc..43700b29899d 100644 --- a/src/operator/dropout-inl.h +++ b/src/operator/nn/dropout-inl.h @@ -23,8 +23,8 @@ * \author Bing Xu */ -#ifndef MXNET_OPERATOR_DROPOUT_INL_H_ -#define MXNET_OPERATOR_DROPOUT_INL_H_ +#ifndef MXNET_OPERATOR_NN_DROPOUT_INL_H_ +#define MXNET_OPERATOR_NN_DROPOUT_INL_H_ #include #include #include @@ -33,15 +33,17 @@ #include #include #include -#include "./operator_common.h" -#include "./mshadow_op.h" +#include "../operator_common.h" +#include "../mshadow_op.h" +#if 0 #if defined(USE_MKL) && defined(_OPENMP) #include #include #include #endif // USE_MKL && _OPENMP +#endif namespace dropout { enum DropoutOpInputs {kData}; @@ -53,6 +55,7 @@ enum DropoutOpMode {kTraining, kAlways}; namespace mxnet { namespace op { +#if 0 #if defined(USE_MKL) && defined(_OPENMP) static void bernoulli_generate(int n, double p, int* r) { int seed = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) @@ -74,6 +77,7 @@ static void bernoulli_generate(int n, double p, int* r) { } } #endif // USE_MKL && _OPENMP +#endif struct DropoutParam : public dmlc::Parameter { float p; @@ -91,18 +95,15 @@ struct DropoutParam : public dmlc::Parameter { }; // struct DropoutParam template -class DropoutOp : public Operator { +class DropoutOp { public: - explicit DropoutOp(DropoutParam param) { + void Init(const DropoutParam ¶m) { this->pkeep_ = 1.0f - param.p; this->mode_ = param.mode; } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { + void Forward(const OpContext &ctx, const std::vector &in_data, + const std::vector &req, const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(in_data.size(), 1U); @@ -114,6 +115,7 @@ class DropoutOp : public Operator { Tensor out = out_data[dropout::kOut].FlatTo2D(s); if (ctx.is_train || mode_ == dropout::kAlways) { Tensor mask = out_data[dropout::kMask].FlatTo2D(s); +#if 0 #if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP) DType* outptr = out.dptr_; DType* dataptr = data.dptr_; @@ -124,33 +126,27 @@ class DropoutOp : public Operator { for (int i = 0; i < count; ++i) { outptr[i] = dataptr[i] * maskptr[i] * (1.0f / pkeep_); } -#else +#endif +#endif Random *prnd = ctx.requested[dropout::kRandom].get_random(s); mask = tcast(F( prnd->uniform(mask.shape_), pkeep_) * (1.0f / pkeep_)); Assign(out, req[dropout::kOut], data * mask); -#endif // USE_MKL && _OPENMP } else { Assign(out, req[dropout::kOut], F(data)); } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector 
&req, - const std::vector &in_grad, - const std::vector &aux_states) { + void Backward(const OpContext &ctx, const TBlob &out_grad, + const TBlob &out_data_mask, const OpReqType &req, const TBlob &in_grad) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_grad.size(), 1U); Stream *s = ctx.get_stream(); - Tensor grad = out_grad[dropout::kOut].FlatTo2D(s); - Tensor mask = out_data[dropout::kMask].FlatTo2D(s); - Tensor gdata = in_grad[dropout::kData].FlatTo2D(s); + Tensor grad = out_grad.FlatTo2D(s); + Tensor mask = out_data_mask.FlatTo2D(s); + Tensor gdata = in_grad.FlatTo2D(s); if (ctx.is_train || mode_ == dropout::kAlways) { +#if 0 #if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP) DType* ingradptr = gdata.dptr_; DType* outgradptr = grad.dptr_; @@ -162,11 +158,11 @@ class DropoutOp : public Operator { for (int i = 0; i < count; ++i) { ingradptr[i] = outgradptr[i] * maskptr[i] * (1.0f / pkeep_); } -#else // USE_MKL && _OPENMP - Assign(gdata, req[dropout::kData], grad * mask); -#endif // USE_MKL && _OPENMP +#endif +#endif + Assign(gdata, req, grad * mask); } else { - Assign(gdata, req[dropout::kData], F(grad)); + Assign(gdata, req, F(grad)); } } @@ -175,111 +171,38 @@ class DropoutOp : public Operator { int mode_; }; // class DropoutOp - template -Operator *CreateOp(DropoutParam param, int dtype); - -#if DMLC_USE_CXX11 -class DropoutProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 1U); - const TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; - out_shape->clear(); - out_shape->push_back(dshape); - out_shape->push_back(dshape); - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_EQ(in_type->size(), 1U); - int dtype = in_type->at(0); - - if (dtype == -1) { - LOG(FATAL) << "input type to dropout is not specified."; - return false; - } - - size_t nout = this->ListOutputs().size(); - out_type->clear(); - for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new DropoutProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "Dropout"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[dropout::kOut], out_data[dropout::kMask]}; - } - - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { - return {{out_grad[dropout::kOut], in_grad[dropout::kData]}}; - } - - std::vector > ForwardInplaceOption( - const std::vector &in_data, - const std::vector &out_data) const override { - return {{in_data[dropout::kData], out_data[dropout::kOut]}}; - } - - std::vector ForwardResource( - const std::vector &in_shape) const override { - return {ResourceRequest::kRandom}; - } - - int NumVisibleOutputs() const override { - return 1; - } - - int NumOutputs() const override { - return 2; - } - - std::vector ListOutputs() const override { - return {"output", "mask"}; 
- } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented"; - return NULL; - } +void DropoutCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DropoutParam& param = nnvm::get(attrs.parsed); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + static thread_local DropoutOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); + }); +} - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +template +void DropoutGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DropoutParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1); + CHECK_EQ(req.size(), 1); + + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + static thread_local DropoutOp op; + op.Init(param); + op.Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]); + }); +} - private: - DropoutParam param_; -}; // class DropoutProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_DROPOUT_INL_H_ +#endif // MXNET_OPERATOR_NN_DROPOUT_INL_H_ diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc new file mode 100644 index 000000000000..da4dd93f3e04 --- /dev/null +++ b/src/operator/nn/dropout.cc @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file dropout.cc + * \brief + * \author Bing Xu +*/ + +#include "./dropout-inl.h" + +namespace mxnet { +namespace op { + +struct DropoutGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + std::vector heads; + heads.push_back(ograds[0]); + heads.emplace_back(nnvm::NodeEntry{n, dropout::kMask, 0}); + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + +std::vector ListOutputs() { + return std::vector{"output", "mask"}; +} + +DMLC_REGISTER_PARAMETER(DropoutParam); + +NNVM_REGISTER_OP(Dropout) +.describe(R"(Applies dropout operation to input array. + +- During training, each element of the input is set to zero with probability p. + The whole array is rescaled by :math:`1/(1-p)` to keep the expected + sum of the input unchanged. + +- During testing, this operator does not change the input if mode is 'training'. + If mode is 'always', the same computation as during training will be applied.
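+
+- Scaling by :math:`1/(1-p)` keeps the expected sum unchanged: with
+  :math:`p = 0.2`, every kept element is multiplied by :math:`1/0.8 = 1.25`,
+  which is the scaling visible in the example below.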
+ +Example:: + + random.seed(998) + input_array = array([[3., 0.5, -0.5, 2., 7.], + [2., -0.4, 7., 3., 0.2]]) + a = symbol.Variable('a') + dropout = symbol.Dropout(a, p = 0.2) + executor = dropout.simple_bind(a = input_array.shape) + + ## If training + executor.forward(is_train = True, a = input_array) + executor.outputs + [[ 3.75 0.625 -0. 2.5 8.75 ] + [ 2.5 -0.5 8.75 3.75 0. ]] + + ## If testing + executor.forward(is_train = False, a = input_array) + executor.outputs + [[ 3. 0.5 -0.5 2. 7. ] + [ 2. -0.4 7. 3. 0.2 ]] +)" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(2) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data"}; +}) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return ListOutputs(); +}) +.set_attr("FNumVisibleOutputs", + [](const NodeAttrs& attrs) { + return 1; +}) +.set_attr("FInferShape", [](const nnvm::NodeAttrs& attrs, + std::vector *in_shape, std::vector *out_shape){ + using namespace mshadow; + CHECK_EQ(in_shape->size(), 1U); + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + out_shape->clear(); + out_shape->push_back(dshape); + out_shape->push_back(dshape); + return true; +}) +.set_attr("FInferType", [](const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + CHECK_EQ(in_type->size(), 1U); + int dtype = in_type->at(0); + + if (dtype == -1) { + LOG(FATAL) << "input type to dropout is not specified."; + return false; + } + + size_t nout = ListOutputs().size(); + out_type->clear(); + for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype); + return true; +}) +.set_attr("FCompute", DropoutCompute) +.set_attr("FGradient", DropoutGrad{"_backward_Dropout"}) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}}; +}) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kRandom}; +}) +.add_argument("data", "NDArray-or-Symbol", "Input array to which dropout will be applied.") +.add_arguments(DropoutParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_Dropout) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr_parser(ParamParser) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}}; +}) +.set_attr("FCompute", DropoutGradCompute); + +} // namespace op +} // namespace mxnet + diff --git a/src/operator/dropout.cu b/src/operator/nn/dropout.cu similarity index 83% rename from src/operator/dropout.cu rename to src/operator/nn/dropout.cu index 5265d8013ff7..60c3a8e6a3f1 100644 --- a/src/operator/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -27,14 +27,13 @@ namespace mxnet { namespace op { -template<> -Operator *CreateOp(DropoutParam param, int dtype) { - Operator *op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new DropoutOp(param); - }); - return op; -} + +NNVM_REGISTER_OP(Dropout) +.set_attr("FCompute", DropoutCompute); + +NNVM_REGISTER_OP(_backward_Dropout) +.set_attr("FCompute", DropoutGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h similarity index 55% rename from src/operator/fully_connected-inl.h rename to src/operator/nn/fully_connected-inl.h index c507e4251f3e..ce40197cd8c7 100644 --- a/src/operator/fully_connected-inl.h +++ b/src/operator/nn/fully_connected-inl.h @@ -21,8 +21,8 @@ * \file fully_connect_op-inl.h * \brief fully connect operator and symbol */ -#ifndef MXNET_OPERATOR_FULLY_CONNECTED_INL_H_ 
-#define MXNET_OPERATOR_FULLY_CONNECTED_INL_H_ +#ifndef MXNET_OPERATOR_NN_FULLY_CONNECTED_INL_H_ +#define MXNET_OPERATOR_NN_FULLY_CONNECTED_INL_H_ #include #include @@ -31,9 +31,9 @@ #include #include #include -#include "./operator_common.h" -#include "./elemwise_op_common.h" -#include "linalg.h" +#include "../operator_common.h" +#include "../elemwise_op_common.h" +#include "../linalg.h" namespace mxnet { namespace op { @@ -65,24 +65,18 @@ struct FullyConnectedParam : public dmlc::Parameter { * \tparam xpu The device that the op will be executed on. */ template -class FullyConnectedOp : public Operator { +class FullyConnectedOp { public: - explicit FullyConnectedOp(FullyConnectedParam p) { + void Init(const FullyConnectedParam &p) { this->param_ = p; } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, const std::vector &in_data, + const std::vector &req, const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; if (req[fullc::kOut] == kNullOp) return; CHECK_EQ(req[fullc::kOut], kWriteTo); - size_t expected = param_.no_bias ? 2 : 3; - CHECK_EQ(in_data.size(), expected); - CHECK_EQ(out_data.size(), 1U); // TODO(bing): check the BLAS Handle, be careful // maybe need blas handle from context // TODO(bing): judge shape to remove flatten op @@ -117,19 +111,11 @@ class FullyConnectedOp : public Operator { } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + virtual void Backward(const OpContext &ctx, const std::vector &out_grad, + const std::vector &in_data, const std::vector &req, + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - size_t expected = param_.no_bias ? 2 : 3; - CHECK(in_data.size() == expected && in_grad.size() == expected); - CHECK_EQ(req.size(), expected); // TODO(bing): check the BLAS Handle, be careful // maybe need blas handle from context Stream *s = ctx.get_stream(); @@ -176,124 +162,80 @@ class FullyConnectedOp : public Operator { linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]); } + static FullyConnectedOp &get_op(const FullyConnectedParam& param) { + static thread_local FullyConnectedOp op; + op.Init(param); + return op; + } + private: FullyConnectedParam param_; }; // class FullyConnectedOp -// Decalre Factory function, used for dispatch specialization template -Operator* CreateOp(FullyConnectedParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx); - -#if DMLC_USE_CXX11 -class FullyConnectedProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (!param_.no_bias) { - return {"data", "weight", "bias"}; - } else { - return {"data", "weight"}; - } - } - - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); +void FullyConnectedCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 
2 : 3; + CHECK_EQ(inputs.size(), in_expected); + CHECK_EQ(outputs.size(), 1U); + int dtype = inputs[0].type_flag_; + + switch (dtype) { + case mshadow::kFloat32: + FullyConnectedOp::get_op(param).Forward(ctx, inputs, + req, outputs); + break; + case mshadow::kFloat64: + FullyConnectedOp::get_op(param).Forward(ctx, inputs, + req, outputs); + break; + case mshadow::kFloat16: + LOG(FATAL) << "float16 fully connected layer is currently" + "only supported by CuDNN version."; + break; + default: + LOG(FATAL) << "Unsupported type " << dtype; } +} - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - if (!param_.no_bias) { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; - } else { - CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; - } - CHECK_EQ(out_shape->size(), 1U); - TShape dshape = (*in_shape)[fullc::kData]; - TShape oshape = (*out_shape)[0]; - // require data to be known - if (dshape.ndim() == 0) return false; - - index_t num_input; - if (!param_.flatten) { - num_input = dshape[dshape.ndim()-1]; - } else { - num_input = dshape.ProdShape(1, dshape.ndim()); - } - SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param_.num_hidden, num_input)); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param_.num_hidden)); - } - - if (!param_.flatten) { - TShape result_shape(dshape); - result_shape[dshape.ndim()-1] = param_.num_hidden; - SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape); - } else { - SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param_.num_hidden)); - } - if (oshape.ndim() != 0) { - dshape[0] = oshape[0]; - SHAPE_ASSIGN_CHECK(*in_shape, fullc::kData, dshape); - } - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - nnvm::NodeAttrs attrs; - attrs.name = "FullyConnected"; - return ElemwiseAttr( - attrs, in_type, out_type, -1); - } - - OperatorProperty* Copy() const override { - FullyConnectedProp* fc_sym = new FullyConnectedProp(); - fc_sym->param_ = this->param_; - return fc_sym; - } - - std::string TypeString() const override { - return "FullyConnected"; - } - - // decalre dependency and inplace optimization options - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[fullc::kOut], in_data[fullc::kData], in_data[fullc::kWeight]}; - } - - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { - return {{in_data[fullc::kData], in_grad[fullc::kData]}}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; +template +void FullyConnectedGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t out_expected = param.no_bias ? 
2 : 3; + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), out_expected); + CHECK_EQ(req.size(), out_expected); + + std::vector out_grad{inputs[0]}; + std::vector in_data(inputs.begin() + 1, inputs.end()); + int dtype = inputs[0].type_flag_; + + switch (dtype) { + case mshadow::kFloat32: + FullyConnectedOp::get_op(param).Backward(ctx, out_grad, in_data, + req, outputs); + break; + case mshadow::kFloat64: + FullyConnectedOp::get_op(param).Backward(ctx, out_grad, in_data, + req, outputs); + break; + case mshadow::kFloat16: + LOG(FATAL) << "float16 fully connected layer is currently" + "only supported by CuDNN version."; + break; + default: + LOG(FATAL) << "Unsupported type " << dtype; } +} - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; - - private: - FullyConnectedParam param_; -}; // class FullyConnectedSymbol -#endif } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_FULLY_CONNECTED_INL_H_ +#endif // MXNET_OPERATOR_NN_FULLY_CONNECTED_INL_H_ diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc new file mode 100644 index 000000000000..2769ead61039 --- /dev/null +++ b/src/operator/nn/fully_connected.cc @@ -0,0 +1,242 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file fully_connected.cc
+ * \brief fully connect operator
+*/
+#include "./fully_connected-inl.h"
+#include "./mkldnn/mkldnn_ops-inl.h"
+#if MXNET_USE_NNPACK == 1
+#include "./nnpack/nnpack_fully_connected-inl.h"
+#endif  // MXNET_USE_NNPACK
+
+namespace mxnet {
+namespace op {
+
+static bool FullyConnectedShape(const nnvm::NodeAttrs& attrs,
+                                std::vector<TShape> *in_shape,
+                                std::vector<TShape> *out_shape) {
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  using namespace mshadow;
+  if (!param.no_bias) {
+    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
+  } else {
+    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
+  }
+  CHECK_EQ(out_shape->size(), 1U);
+  TShape dshape = (*in_shape)[fullc::kData];
+  TShape oshape = (*out_shape)[0];
+  // require data to be known
+  if (dshape.ndim() == 0) return false;
+
+  index_t num_input;
+  if (!param.flatten) {
+    num_input = dshape[dshape.ndim()-1];
+  } else {
+    num_input = dshape.ProdShape(1, dshape.ndim());
+  }
+  SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param.num_hidden, num_input));
+  if (!param.no_bias) {
+    SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param.num_hidden));
+  }
+
+  if (!param.flatten) {
+    TShape result_shape(dshape);
+    result_shape[dshape.ndim()-1] = param.num_hidden;
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape);
+  } else {
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param.num_hidden));
+  }
+  if (oshape.ndim() != 0) {
+    dshape[0] = oshape[0];
+    SHAPE_ASSIGN_CHECK(*in_shape, fullc::kData, dshape);
+  }
+  return true;
+}
+
+void FullyConnectedCompute_CPU(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray> &inputs, const std::vector<OpReqType> &req,
+    const std::vector<NDArray> &outputs) {
+#if MXNET_USE_MKLDNN == 1
+  switch (inputs[0].dtype()) {
+    case mshadow::kFloat32:
+      MKLDNNFC_Forward(attrs, ctx, inputs, req, outputs);
+      return;
+    default:
+      break;  // other dtypes fall through to the default implementation
+  }
+#endif
+  // TODO: convert the format if an input is an MKLDNN array.
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++)
+    in_blobs[i] = inputs[i].data();
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++)
+    out_blobs[i] = outputs[i].data();
+  FullyConnectedCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
+}
+
+void FullyConnectedGradCompute_CPU(const nnvm::NodeAttrs& attrs,
+    const OpContext &ctx, const std::vector<NDArray> &inputs,
+    const std::vector<OpReqType> &req, const std::vector<NDArray> &outputs) {
+#if MXNET_USE_MKLDNN == 1
+  switch (inputs[0].dtype()) {
+    case mshadow::kFloat32:
+      MKLDNNFC_Backward(attrs, ctx, inputs, req, outputs);
+      return;
+    default:
+      break;  // other dtypes fall through to the default implementation
+  }
+#endif
+  // TODO: convert the format if an input is an MKLDNN array.
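+  // The block below is the generic fallback shared by all the *_CPU
+  // wrappers in this patch: when no MKLDNN primitive handled the call
+  // above, every NDArray is unwrapped into its backing TBlob and the
+  // pre-existing FCompute<cpu> implementation is invoked. This assumes
+  // the arrays are (or can be read as) default-storage arrays.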
+ std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + FullyConnectedGradCompute(attrs, ctx, in_blobs, req, out_blobs); +} + +static bool FullyConnectedType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + CHECK_GE(in_type->size(), 1U); + return ElemwiseAttr( + attrs, in_type, out_type, -1); +} + +struct FullyConnectedGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + std::vector heads(ograds.begin(), ograds.end()); + heads.push_back(n->inputs[fullc::kData]); + heads.push_back(n->inputs[fullc::kWeight]); + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + +inline static bool FCStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 2 : 3; + CHECK_EQ(in_attrs->size(), in_expected); + CHECK_EQ(out_attrs->size(), 1); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + (*out_attrs)[0] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + (*out_attrs)[0] = kDefaultStorage; + return true; +} + +inline static bool backward_FCStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t out_expected = param.no_bias ? 2 : 3; + CHECK_EQ(in_attrs->size(), 3U); + CHECK_EQ(out_attrs->size(), out_expected); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kDefaultStorage; + return true; +} + +DMLC_REGISTER_PARAMETER(FullyConnectedParam); + +NNVM_REGISTER_OP(FullyConnected) +.describe(R"code(Applies a linear transformation: :math:`Y = XW^T + b`. + +If ``flatten`` is set to be true, then the shapes are: + +- **data**: `(batch_size, x1, x2, ..., xn)` +- **weight**: `(num_hidden, x1 * x2 * ... * xn)` +- **bias**: `(num_hidden,)` +- **out**: `(batch_size, num_hidden)` + +If ``flatten`` is set to be false, then the shapes are: + +- **data**: `(x1, x2, ..., xn, input_dim)` +- **weight**: `(num_hidden, input_dim)` +- **bias**: `(num_hidden,)` +- **out**: `(x1, x2, ..., xn, num_hidden)` + +The learnable parameters include both ``weight`` and ``bias``. + +If ``no_bias`` is set to be true, then the ``bias`` term is ignored. + +)code" ADD_FILELINE) +.set_num_inputs([](const NodeAttrs& attrs) { + const FullyConnectedParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 
2 : 3; +}) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferStorageType", FCStorageType) +.set_attr("FListInputNames", [](const NodeAttrs& attrs) { + const FullyConnectedParam& params = nnvm::get(attrs.parsed); + if (!params.no_bias) { + return std::vector{"data", "weight", "bias"}; + } else { + return std::vector{"data", "weight"}; + } +}) +.set_attr("FInferShape", FullyConnectedShape) +.set_attr("FInferType", FullyConnectedType) +.set_attr("FCompute", FullyConnectedCompute) +.set_attr("FComputeEx", FullyConnectedCompute_CPU) +.set_attr("FGradient", FullyConnectedGrad{"_backward_FullyConnected"}) +.add_argument("data", "NDArray-or-Symbol", "Input data.") +.add_argument("weight", "NDArray-or-Symbol", "Weight matrix.") +.add_argument("bias", "NDArray-or-Symbol", "Bias parameter.") +.add_arguments(FullyConnectedParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_FullyConnected) +.set_num_inputs(3) +.set_num_outputs([](const NodeAttrs& attrs) { + const FullyConnectedParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 2 : 3; +}) +.set_attr("TIsBackward", true) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ + return std::vector >{{1, 0}}; +}) +.set_attr("FInferStorageType", backward_FCStorageType) +.set_attr_parser(ParamParser) +.set_attr("FCompute", FullyConnectedGradCompute) +.set_attr("FComputeEx", FullyConnectedGradCompute_CPU); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/nn/fully_connected.cu b/src/operator/nn/fully_connected.cu new file mode 100644 index 000000000000..0079a1e24fc5 --- /dev/null +++ b/src/operator/nn/fully_connected.cu @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file fully_connected.cu + * \brief fully connect operator +*/ +#include "./fully_connected-inl.h" +namespace mxnet { +namespace op { + +template<> +void FullyConnectedCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 2 : 3; + CHECK_EQ(inputs.size(), in_expected); + CHECK_EQ(outputs.size(), 1U); + int dtype = inputs[0].type_flag_; + + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + FullyConnectedOp::get_op(param).Forward(ctx, inputs, + req, outputs); + }); +} + +template<> +void FullyConnectedGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t out_expected = param.no_bias ? 
2 : 3; + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), out_expected); + CHECK_EQ(req.size(), out_expected); + + std::vector out_grad{inputs[0]}; + std::vector in_data(inputs.begin() + 1, inputs.end()); + int dtype = inputs[0].type_flag_; + + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + FullyConnectedOp::get_op(param).Backward(ctx, out_grad, in_data, + req, outputs); + }); +} + +NNVM_REGISTER_OP(FullyConnected) +.set_attr("FCompute", FullyConnectedCompute); + +NNVM_REGISTER_OP(_backward_FullyConnected) +.set_attr("FCompute", FullyConnectedGradCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/nn/mkldnn/mkldnn_act-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h new file mode 100644 index 000000000000..b368913a61a3 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_act-inl.h @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_act-inl.h + * \brief + * \author Da Zheng +*/ + +#ifndef MXNET_OPERATOR_MKL_MKLDNN_ACT_INL_H_ +#define MXNET_OPERATOR_MKL_MKLDNN_ACT_INL_H_ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../operator_common.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 + +#include + +namespace mxnet { +namespace op { + +static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) { + switch (param.act_type) { + case activation::kReLU: + return mkldnn::algorithm::eltwise_relu; + case activation::kSigmoid: + return mkldnn::algorithm::eltwise_logistic; + case activation::kTanh: + return mkldnn::algorithm::eltwise_tanh; + case activation::kSoftReLU: + return mkldnn::algorithm::eltwise_soft_relu; + default: + LOG(FATAL) << "unknown activation type"; + return mkldnn::algorithm::eltwise_relu; + } +} + +template +void MKLDNNAct_Forward(const OpContext &ctx, const ActivationParam& param, + const NDArray &in_data, const OpReqType &req, const NDArray &out_data) { + std::shared_ptr input_mem = in_data.GetMKLDNNData(); + mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); + mkldnn::memory::desc data_md = data_mpd.desc(); + auto cpu_engine = data_mpd.get_engine(); + Dtype alpha = 0; + + auto alg = GetMKLDNNActAlgo(param); + mkldnn::eltwise_forward::desc desc = ctx.is_train + ? 
mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_training,
+        alg, data_md, alpha)
+    : mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_scoring,
+        alg, data_md, alpha);
+  mkldnn::eltwise_forward::primitive_desc pdesc(desc, cpu_engine);
+
+  std::shared_ptr<mkldnn::memory> output_memory
+    = const_cast<NDArray &>(out_data).CreateMKLDNNData(pdesc.dst_primitive_desc());
+  MKLDNNStream &stream = MKLDNNStream::Instance();
+  stream.RegisterPrim(mkldnn::eltwise_forward(pdesc, *input_mem, *output_memory));
+  stream.Submit();
+}
+
+template<typename Dtype>
+void MKLDNNAct_Backward(const OpContext &ctx, const ActivationParam& param,
+    const NDArray &out_grad, const NDArray &in_data, const OpReqType &req,
+    const NDArray &in_grad) {
+  if (req == kNullOp) {
+    return;
+  }
+
+  std::shared_ptr<const mkldnn::memory> diff_dst_memory = out_grad.GetMKLDNNData();
+  std::shared_ptr<const mkldnn::memory> input_mem = in_data.GetMKLDNNData();
+  mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc();
+  mkldnn::memory::desc data_md = data_mpd.desc();
+  mkldnn::memory::desc diff_md = diff_dst_memory->get_primitive_desc().desc();
+  auto cpu_engine = data_mpd.get_engine();
+  Dtype alpha = 0;
+
+  auto alg = GetMKLDNNActAlgo(param);
+  mkldnn::eltwise_forward::desc fw_desc(mkldnn::prop_kind::forward_training,
+      alg, data_md, alpha);
+  mkldnn::eltwise_forward::primitive_desc fw_pdesc(fw_desc, cpu_engine);
+  mkldnn::eltwise_backward::desc bw_desc(alg, diff_md, data_md, alpha);
+  mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine, fw_pdesc);
+
+  auto diff_src_memory = CreateMKLDNNMem(in_grad, bw_pdesc.diff_src_primitive_desc(), req);
+  MKLDNNStream &stream = MKLDNNStream::Instance();
+  stream.RegisterPrim(mkldnn::eltwise_backward(bw_pdesc, *input_mem,
+        *diff_dst_memory, *diff_src_memory.second));
+  CommitOutput(in_grad, diff_src_memory);
+  stream.Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_MKL_MKLDNN_ACT_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
new file mode 100644
index 000000000000..33b9884e6252
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -0,0 +1,270 @@
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_base-inl.h
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*
+*******************************************************************************/
+
+#ifndef MXNET_OPERATOR_MKL_MKLDNN_BASE_INL_H_
+#define MXNET_OPERATOR_MKL_MKLDNN_BASE_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+#include
+#include
+#include
+#include "mkldnn.hpp"
+
+namespace mxnet {
+extern bool EnableMkldnnWarnGenerated();
+// ===== CpuEngine =======================================
+// cpu_engine singleton
+class CpuEngine {
+ public:
+  static CpuEngine & Instance() {
+    // It's thread-safe in C++11.
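+    // A function-local static is initialized exactly once even under
+    // concurrent first use (C++11 "magic statics"), and thread_local
+    // additionally gives every worker thread its own engine object, so
+    // Instance() needs no explicit locking.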
+    static thread_local CpuEngine myInstance;
+    return myInstance;
+  }
+  CpuEngine(CpuEngine const&) = delete;             // Copy construct
+  CpuEngine(CpuEngine&&) = delete;                  // Move construct
+  CpuEngine& operator=(CpuEngine const&) = delete;  // Copy assign
+  CpuEngine& operator=(CpuEngine &&) = delete;      // Move assign
+
+  mkldnn::engine & get_engine() { return _cpu_engine; }
+ protected:
+  CpuEngine() : _cpu_engine(mkldnn::engine::cpu, 0) {}
+  ~CpuEngine() {}
+ private:
+  mkldnn::engine _cpu_engine;
+};
+
+// type enumerator
+template<typename T>
+struct data_type_enum {};
+
+template<>
+struct data_type_enum<float> {
+  enum { type = mkldnn::memory::data_type::f32 };
+};
+
+template<>
+struct data_type_enum<int32_t> {
+  enum { type = mkldnn::memory::data_type::s32 };
+};
+
+template<>
+struct data_type_enum<int16_t> {
+  enum { type = mkldnn::memory::data_type::s16 };
+};
+
+template<>
+struct data_type_enum<int8_t> {
+  enum { type = mkldnn::memory::data_type::s8 };
+};
+
+template<>
+struct data_type_enum<uint8_t> {
+  enum { type = mkldnn::memory::data_type::u8 };
+};
+
+static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) {
+  switch (dtype) {
+    case mshadow::kFloat32:
+      return mkldnn::memory::data_type::f32;
+    default:
+      return mkldnn::memory::data_type::data_undef;
+  }
+}
+
+inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int ndim) {
+  mkldnn::memory::dims dims(ndim);
+  for (size_t i = 0; i < dims.size(); i++)
+    dims[i] = arr.shape()[i];
+  return mkldnn::memory::desc{dims, get_mkldnn_type(arr.dtype()),
+      mkldnn::memory::format::any};
+}
+
+inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr) {
+  return GetMemDesc(arr, arr.shape().ndim());
+}
+
+inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
+    int num_groups) {
+  if (num_groups == 1) {
+    return GetMemDesc(arr);
+  } else {
+    CHECK_EQ(arr.shape().ndim(), 4U);
+    mkldnn::memory::dims tz = mkldnn::memory::dims{num_groups,
+        (int) arr.shape()[0] / num_groups, (int) arr.shape()[1],
+        (int) arr.shape()[2], (int) arr.shape()[3]};
+    return mkldnn::memory::desc{tz, get_mkldnn_type(arr.dtype()),
+        mkldnn::memory::format::any};
+  }
+}
+
+typedef std::shared_ptr<mkldnn::memory> mkldnn_mem_ptr;
+typedef std::shared_ptr<const mkldnn::memory> mkldnn_mem_const_ptr;
+
+class MKLDNNStream {
+  std::vector<mkldnn::primitive> net;
+  // Here we hold all memory related to the operators in the stream.
+  std::vector<mkldnn_mem_const_ptr> mem_holder;
+ public:
+  static MKLDNNStream &Instance() {
+    static thread_local MKLDNNStream stream;
+    return stream;
+  }
+
+  void RegisterPrim(const mkldnn::primitive &prim) {
+    net.push_back(prim);
+  }
+
+  void RegisterMem(mkldnn_mem_const_ptr mem) {
+    mem_holder.push_back(mem);
+  }
+
+  void Submit() {
+    mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+    net.clear();
+    mem_holder.clear();
+  }
+};
+
+inline static mkldnn_mem_ptr CreateMKLDNNMem(const mkldnn::memory::primitive_desc &desc) {
+  // TODO: allocate memory more efficiently.
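+  // For now every temporary comes straight from the heap; the shared
+  // pointer is registered with the per-thread MKLDNNStream, which keeps
+  // the buffer alive until Submit() has executed the queued primitives
+  // and clears its holder.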
+ std::shared_ptr ret(new mkldnn::memory(desc)); + MKLDNNStream::Instance().RegisterMem(ret); + return ret; +} + +enum OutDataOp { + Noop, + CopyBack, + AddBack, +}; + +typedef std::pair mkldnn_output_t; + +static inline mkldnn_output_t CreateMKLDNNMem(const NDArray &arr, + const mkldnn::memory::primitive_desc &desc, OpReqType req) { + if (kAddTo == req) + return mkldnn_output_t(OutDataOp::AddBack, CreateMKLDNNMem(desc)); + else { + mkldnn_mem_ptr mem = const_cast(arr).CreateMKLDNNData(desc); + if (mem == nullptr) + return mkldnn_output_t(OutDataOp::CopyBack, CreateMKLDNNMem(desc)); + else + return mkldnn_output_t(OutDataOp::Noop, mem); + } +} + +namespace op { +void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, + const mkldnn::memory &out); +} + +static inline void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) { + if (res.first == CopyBack) + const_cast(arr).CopyFrom(*res.second); + else if (res.first == AddBack) { + // TODO I might need to reorder. + mkldnn_mem_const_ptr mem = arr.GetMKLDNNData(res.second->get_primitive_desc()); + CHECK(mem != nullptr); + // We have to allocate new memory for the sum result. + mkldnn_mem_ptr sum_res(new mkldnn::memory(res.second->get_primitive_desc())); + MKLDNNStream::Instance().RegisterMem(sum_res); + op::Sum(*res.second, *mem, *sum_res); + const_cast(arr).CopyFrom(*sum_res); + } +} + +inline static mkldnn_mem_const_ptr GetWeights(const NDArray &arr, + const mkldnn::memory::primitive_desc &target_pd, int num_groups) { + mkldnn_mem_const_ptr mem; + mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); + auto engine = CpuEngine::Instance().get_engine(); + if (arr.shape().ndim() == 2) { + mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0], + (int) arr.shape()[1]}; + mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; + mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; + mem = arr.GetMKLDNNData(pd); + } + else if (arr.shape().ndim() == 4 && num_groups == 1) { + mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0], + (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]}; + mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; + mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; + mem = arr.GetMKLDNNData(pd); + } + else if (arr.shape().ndim() == 4) { + mkldnn::memory::dims tz = mkldnn::memory::dims{num_groups, (int) arr.shape()[0] / num_groups, + (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]}; + mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; + mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; + mem = arr.GetMKLDNNData(pd); + } + else { + LOG(FATAL) << "The weight array has an unsupported number of dimensions"; + return nullptr; + } + if (mem->get_primitive_desc() == target_pd) + return mem; + + std::shared_ptr ret = CreateMKLDNNMem(target_pd); + MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder(*mem, *ret)); + return ret; +} + +inline static mkldnn_mem_const_ptr GetWeights(const NDArray &arr, + const mkldnn::engine &engine, int num_groups = 1) { + mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); + if (arr.shape().ndim() == 2) { + mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0], + (int) arr.shape()[1]}; + mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; + mkldnn::memory::primitive_desc pd = 
mkldnn::memory::primitive_desc{md, engine}; + return arr.GetMKLDNNData(pd); + } + else if (arr.shape().ndim() == 4 && num_groups == 1) { + mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0], + (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]}; + mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; + mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; + return arr.GetMKLDNNData(pd); + } + else if (arr.shape().ndim() == 4) { + mkldnn::memory::dims tz = mkldnn::memory::dims{num_groups, (int) arr.shape()[0] / num_groups, + (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]}; + mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; + mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; + return arr.GetMKLDNNData(pd); + } + else { + LOG(FATAL) << "The weight array has an unsupported number of dimensions"; + return nullptr; + } +} + +} // namespace mxnet +#endif +#endif // MXNET_OPERATOR_MKL_MKLDNN_BASE_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc new file mode 100644 index 000000000000..5e300ff0086e --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_concat.cc @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file mkldnn_concat.cc
+ * \brief
+ * \author Wenting Jiang
+*/
+#include
+
+#include "../../concat-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray> &in_data, const std::vector<OpReqType> &req,
+    const std::vector<NDArray> &out_data) {
+  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
+  int num_in_data = param.num_args;
+  int concat_dim = param.dim;
+  std::vector<mkldnn::memory::primitive_desc> data_md;
+  std::vector<mkldnn::primitive::at> data_mem;
+  for (int i = 0; i < num_in_data; i++) {
+    std::shared_ptr<const mkldnn::memory> mem = in_data[i].GetMKLDNNData();
+    data_md.push_back(mem->get_primitive_desc());
+    data_mem.push_back(*mem);
+  }
+  mkldnn::concat::primitive_desc fwd_pd(concat_dim, data_md);
+
+  auto out_mem = CreateMKLDNNMem(out_data[concat_enum::kOut],
+      fwd_pd.dst_primitive_desc(), req[concat_enum::kOut]);
+
+  MKLDNNStream::Instance().RegisterPrim(mkldnn::concat(fwd_pd, data_mem, *out_mem.second));
+
+  CommitOutput(out_data[concat_enum::kOut], out_mem);
+  MKLDNNStream::Instance().Submit();
+}
+
+void MKLDNNConcat_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray>& inputs, const std::vector<OpReqType>& req,
+    const std::vector<NDArray>& outputs) {
+  // inputs: gz, inputs_0, inputs_1, ...
+  // outputs have the shapes of inputs_0, inputs_1, ...
+  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
+  int num_in_data = param.num_args;
+  int axis = param.dim;
+  std::shared_ptr<const mkldnn::memory> gz_mem = inputs[0].GetMKLDNNData();
+  mkldnn::memory::primitive_desc gz_pd = gz_mem->get_primitive_desc();
+  // initialize the offset into gz
+  mkldnn::memory::dims offsets = {0, 0, 0, 0};
+
+  for (int i = 0; i < num_in_data; i++) {
+    mkldnn::memory::dims diff_src_tz = {inputs[i+1].shape()[0], inputs[i+1].shape()[1],
+        inputs[i+1].shape()[2], inputs[i+1].shape()[3]};
+    auto diff_src_mpd = inputs[i+1].GetMKLDNNData()->get_primitive_desc();
+    auto gradi_mem = CreateMKLDNNMem(outputs[i], diff_src_mpd, req[i]);
+    // create a view from gy to gxs[i]
+    std::shared_ptr<mkldnn::view::primitive_desc> view_pd;
+    view_pd.reset(new mkldnn::view::primitive_desc(gz_pd, diff_src_tz, offsets));
+    // create a reorder primitive from gy to gxs[i]
+    mkldnn::reorder::primitive_desc reorder_pd(view_pd.get()->dst_primitive_desc(), diff_src_mpd);
+    offsets[axis] += diff_src_tz[axis];
+    MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder(reorder_pd, *gz_mem, *gradi_mem.second));
+
+    CommitOutput(outputs[i], gradi_mem);
+  }
+  MKLDNNStream::Instance().Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc
new file mode 100644
index 000000000000..e152a29fc92f
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -0,0 +1,254 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_convolution.cc + * \brief + * \author Da Zheng +*/ + +#include "../convolution-inl.h" +#include "./mkldnn_ops-inl.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 +namespace mxnet { +namespace op { + +static mkldnn::convolution_forward::primitive_desc GetConvFwd( + const ConvolutionParam& param, bool is_train, const NDArray &data, + const NDArray &weights, const NDArray *bias, const NDArray &output) { + auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } + if (param.dilate.ndim() == 0 && bias == nullptr) { + mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, + data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } + else if (param.dilate.ndim() == 0) { + auto bias_md = GetMemDesc(*bias); + mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, + data_md, weight_md, bias_md, out_md, strides, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } + else { + mkldnn::memory::dims dilates{0, 0}; + if (param.dilate.ndim() == 2) { + dilates[0] = param.dilate[0] - 1; + dilates[1] = param.dilate[1] - 1; + } + if (bias == nullptr) { + mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, + data_md, weight_md, out_md, strides, dilates, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } + else { + auto bias_md = GetMemDesc(*bias); + mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, + data_md, weight_md, bias_md, out_md, strides, dilates, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } + } +} + +static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData( + const ConvolutionParam& param, const NDArray &data, const NDArray &weights, + const NDArray &output, const mkldnn::convolution_forward::primitive_desc &fwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); + mkldnn::memory::dims strides{0, 0}; + if 
(param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } + if (param.dilate.ndim() == 0) { + mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd); + } + else { + mkldnn::memory::dims dilates{0, 0}; + if (param.dilate.ndim() == 2) { + dilates[0] = param.dilate[0] - 1; + dilates[1] = param.dilate[1] - 1; + } + mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weight_md, out_md, strides, dilates, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd); + } +} + +static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights( + const ConvolutionParam& param, const NDArray &data, + const NDArray &weights, const NDArray *bias, const NDArray &output, + const mkldnn::convolution_forward::primitive_desc &fwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } + if (param.dilate.ndim() == 0 && bias == nullptr) { + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } + else if (param.dilate.ndim() == 0) { + auto bias_md = GetMemDesc(*bias); + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weight_md, bias_md, out_md, strides, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } + else { + mkldnn::memory::dims dilates{0, 0}; + if (param.dilate.ndim() == 2) { + dilates[0] = param.dilate[0] - 1; + dilates[1] = param.dilate[1] - 1; + } + if (bias == nullptr) { + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weight_md, out_md, strides, dilates, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } + else { + auto bias_md = GetMemDesc(*bias); + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weight_md, bias_md, out_md, strides, dilates, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } + } +} + +void MKLDNNConvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, const std::vector &req, + const std::vector &out_data) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwd(param, + ctx.is_train, in_data[conv::kData], in_data[conv::kWeight], + param.no_bias ? 
nullptr : &in_data[conv::kBias], out_data[conv::kOut]); + auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd_pd.src_primitive_desc()); + auto engine = CpuEngine::Instance().get_engine(); + auto weight_mem = GetWeights(in_data[conv::kWeight], + fwd_pd.weights_primitive_desc(), param.num_group); + auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], + fwd_pd.dst_primitive_desc(), req[conv::kOut]); + + if (param.no_bias) { + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_forward(fwd_pd, + *data_mem, *weight_mem, *out_mem.second)); + } else { + auto bias_mem = in_data[conv::kBias].GetMKLDNNDataReorder(fwd_pd.bias_primitive_desc()); + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_forward(fwd_pd, + *data_mem, *weight_mem, *bias_mem, *out_mem.second)); + } + CommitOutput(out_data[conv::kOut], out_mem); + MKLDNNStream::Instance().Submit(); +} + +void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, const std::vector& req, + const std::vector& outputs) { + const std::vector &in_grad = outputs; + auto engine = CpuEngine::Instance().get_engine(); + const ConvolutionParam& param = nnvm::get(attrs.parsed); + mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwd(param, ctx.is_train, + inputs[conv::kData + 1], inputs[conv::kWeight + 1], + param.no_bias ? nullptr : &inputs[conv::kBias + 1], inputs[conv::kOut]); + + CHECK_NE(req[conv::kWeight], kWriteInplace) << "cannot write weight inplace"; + if (req[conv::kData]) { + mkldnn::convolution_backward_data::primitive_desc bwdData_pd + = GetConvBwdData(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1], + inputs[conv::kOut], fwd_pd); + auto out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder( + bwdData_pd.diff_dst_primitive_desc()); + auto weight_mem = GetWeights(inputs[conv::kWeight + 1], + bwdData_pd.weights_primitive_desc(), param.num_group); + auto in_grad_mem = CreateMKLDNNMem(in_grad[conv::kData], + bwdData_pd.diff_src_primitive_desc(), req[conv::kData]); + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_data(bwdData_pd, + *out_grad_mem, *weight_mem, *in_grad_mem.second)); + CommitOutput(in_grad[conv::kData], in_grad_mem); + } + if (req[conv::kWeight]) { + mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd + = GetConvBwdWeights(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1], + param.no_bias ? 
nullptr : &inputs[conv::kBias + 1], inputs[conv::kOut], fwd_pd); + auto out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder( + bwdWeights_pd.diff_dst_primitive_desc()); + auto data_mem = inputs[conv::kData + 1].GetMKLDNNDataReorder( + bwdWeights_pd.src_primitive_desc()); + auto in_grad_weight = CreateMKLDNNMem(in_grad[conv::kWeight], + bwdWeights_pd.diff_weights_primitive_desc(), req[conv::kWeight]); + mkldnn_output_t in_grad_bias; + if (param.no_bias) { + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( + bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second)); + } else { + in_grad_bias = CreateMKLDNNMem(in_grad[conv::kBias], + bwdWeights_pd.diff_bias_primitive_desc(), req[conv::kBias]); + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( + bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second, + *in_grad_bias.second)); + } + CommitOutput(in_grad[conv::kWeight], in_grad_weight); + CommitOutput(in_grad[conv::kBias], in_grad_bias); + } + MKLDNNStream::Instance().Submit(); +} + +} +} + +#endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc new file mode 100644 index 000000000000..7e5daf6ed251 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file mkldnn_deconvolution.cc + * \brief + * \author Da Zheng +*/ + +#include "../deconvolution-inl.h" +#include "./mkldnn_ops-inl.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 +namespace mxnet { +namespace op { + +static mkldnn::convolution_forward::primitive_desc GetDeconvBwd_( + const mkldnn::memory::desc &data_md, const mkldnn::memory::desc &weights_md, + const mkldnn::memory::desc *bias_md, const mkldnn::memory::desc &out_md, + const mkldnn::engine &engine, const mkldnn::memory::dims &strides, + const mkldnn::memory::dims &padding) { + // TODO when dilate > 1 + if (bias_md == nullptr) { + mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training, + mkldnn::algorithm::convolution_direct, out_md, weights_md, data_md, strides, + padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } + else { + mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training, + mkldnn::algorithm::convolution_direct, out_md, weights_md, + *bias_md, data_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } +} + +static mkldnn::convolution_backward_data::primitive_desc GetDeconvFwd( + const DeconvolutionParam& param, const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } + if (bias) { + auto bias_md = GetMemDesc(*bias); + auto bwd_pd = GetDeconvBwd_(data_md, weight_md, &bias_md, + out_md, engine, strides, padding); + // TODO when dilate > 1 + mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, + out_md, weight_md, data_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_data::primitive_desc(desc, engine, bwd_pd); + } + else { + auto bwd_pd = GetDeconvBwd_(data_md, weight_md, nullptr, out_md, engine, + strides, padding); + // TODO when dilate > 1 + mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, + out_md, weight_md, data_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_data::primitive_desc(desc, engine, bwd_pd); + } +} + +static mkldnn::convolution_forward::primitive_desc GetDeconvBwdData( + const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } + // TODO dilate + if (bias) { + auto bias_md = GetMemDesc(*bias); + return GetDeconvBwd_(data_md, weight_md, &bias_md, out_md, + engine, strides, padding); + } + else + return GetDeconvBwd_(data_md, weight_md, nullptr, 
out_md, + engine, strides, padding); +} + +static mkldnn::convolution_backward_weights::primitive_desc GetDeconvBwdWeights( + const DeconvolutionParam& param, const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output, + const mkldnn::convolution_forward::primitive_desc &fwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } + if (/*param.dilate.ndim() == 0 &&*/ bias == nullptr) { + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + out_md, weight_md, data_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } + else /*if (param.dilate.ndim() == 0)*/ { + auto bias_md = GetMemDesc(*bias); + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + out_md, weight_md, bias_md, data_md, strides, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } +// else { +// // TODO I should test the case with dilate. +// mkldnn::memory::dims dilates{0, 0}; +// if (param.dilate.ndim() == 2) { +// dilates[0] = param.dilate[0]; +// dilates[1] = param.dilate[1]; +// } +// if (bias_mem == nullptr) { +// mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, +// data_md, weights_md, out_md, strides, dilates, padding, padding, +// mkldnn::padding_kind::zero); +// return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); +// } +// else { +// mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, +// data_md, weights_md, bias_mem->get_primitive_desc().desc(), out_md, +// strides, dilates, padding, padding, mkldnn::padding_kind::zero); +// return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); +// } +// } +} + +void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, const std::vector &req, + const std::vector &out_data) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + + mkldnn::convolution_backward_data::primitive_desc deconvFwd_pd = GetDeconvFwd( + param, in_data[deconv::kData], in_data[deconv::kWeight], + param.no_bias ? nullptr : &in_data[deconv::kBias], out_data[deconv::kOut]); + auto data_mem = in_data[deconv::kData].GetMKLDNNDataReorder( + deconvFwd_pd.diff_dst_primitive_desc()); + auto weight_mem = GetWeights(in_data[deconv::kWeight], + deconvFwd_pd.weights_primitive_desc(), param.num_group); + auto out_mem = CreateMKLDNNMem(out_data[deconv::kOut], + deconvFwd_pd.diff_src_primitive_desc(), req[deconv::kOut]); + + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_data( + deconvFwd_pd, *data_mem, *weight_mem, *out_mem.second)); + CommitOutput(out_data[deconv::kOut], out_mem); + MKLDNNStream::Instance().Submit(); + if (!param.no_bias) { + // add bias, broadcast bias to dim 1: channel + // TODO this is problematic if the layout isn't expected. + // we need to handle the type correctly. 
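+    // The fallback below adds the bias with mshadow on the CPU. It assumes
+    // a float32 output that is already in the default (NCHW-like) layout
+    // and broadcasts the 1-D bias along dimension 1, the channel axis.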
+ typedef float DType; + Stream *s = ctx.get_stream(); + Tensor bias = in_data[deconv::kBias].data().get(s); + Tensor out_cpu = out_data[deconv::kOut].data().get(s); + out_cpu += mshadow::expr::broadcast<1>(bias, out_cpu.shape_); + } +} + +void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, const std::vector& req, + const std::vector& outputs) { + const std::vector &in_grad = outputs; + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; + mkldnn::convolution_forward::primitive_desc bwdData_pd = GetDeconvBwdData( + param, inputs[deconv::kData + 1], inputs[deconv::kWeight + 1], nullptr, + inputs[deconv::kOut]); + if (req[deconv::kData]) { + auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder( + bwdData_pd.src_primitive_desc()); + auto weight_mem = GetWeights(inputs[deconv::kWeight + 1], + bwdData_pd.weights_primitive_desc(), param.num_group); + auto in_grad_mem = CreateMKLDNNMem(in_grad[deconv::kData], + bwdData_pd.dst_primitive_desc(), req[deconv::kData]); + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_forward(bwdData_pd, + *out_grad_mem, *weight_mem, *in_grad_mem.second)); + CommitOutput(in_grad[deconv::kData], in_grad_mem); + } + if (req[deconv::kWeight]) { + mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd + = GetDeconvBwdWeights(param, inputs[deconv::kData + 1], + inputs[deconv::kWeight + 1], + param.no_bias ? nullptr : &inputs[deconv::kWeight + 1], + inputs[deconv::kOut], bwdData_pd); + auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder( + bwdWeights_pd.src_primitive_desc()); + auto data_mem = inputs[deconv::kData + 1].GetMKLDNNDataReorder( + bwdWeights_pd.diff_dst_primitive_desc()); + auto in_grad_weight = CreateMKLDNNMem(in_grad[deconv::kWeight], + bwdWeights_pd.diff_weights_primitive_desc(), req[deconv::kWeight]); + mkldnn_output_t in_grad_bias; + if (param.no_bias) { + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( + bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight.second)); + } else { + in_grad_bias = CreateMKLDNNMem(in_grad[deconv::kBias], + bwdWeights_pd.diff_bias_primitive_desc(), req[deconv::kBias]); + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( + bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight.second, + *in_grad_bias.second)); + } + CommitOutput(in_grad[deconv::kWeight], in_grad_weight); + CommitOutput(in_grad[deconv::kBias], in_grad_bias); + } + MKLDNNStream::Instance().Submit(); +} + +} +} + +#endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc new file mode 100644 index 000000000000..2a9e1ba4f7d8 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_fully_connected.cc + * \brief + * \author Da Zheng +*/ + +#include "../fully_connected-inl.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 +namespace mxnet { +namespace op { + +inline static mkldnn::inner_product_forward::primitive_desc GetIPFwd( + const NDArray &data, const NDArray &weight, const NDArray *bias, + const NDArray &output) { + auto data_md = GetMemDesc(data); + auto weight_md = GetMemDesc(weight); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); + if (bias) { + auto bias_md = GetMemDesc(*bias); + mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training, + data_md, weight_md, bias_md, out_md); + return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine); + } + else { + mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training, + data_md, weight_md, out_md); + return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine); + } +} + +inline static mkldnn::inner_product_backward_data::primitive_desc GetIpBwdData( + const NDArray &data, const NDArray &weight, const NDArray &output, + mkldnn::inner_product_forward::primitive_desc ipFwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetMemDesc(weight); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); + mkldnn::inner_product_backward_data::desc desc(data_md, weight_md, out_md); + return mkldnn::inner_product_backward_data::primitive_desc(desc, engine, ipFwd_pd); +} + +inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwdWeights( + const NDArray &data, const NDArray &weight, const NDArray *bias, + const NDArray &output, mkldnn::inner_product_forward::primitive_desc ipFwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetMemDesc(weight); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); + if (bias) { + auto bias_md = GetMemDesc(*bias); + mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_md, + weight_md, bias_md, out_md); + return mkldnn::inner_product_backward_weights::primitive_desc( + ipBwdWeights_desc, engine, ipFwd_pd); + } + else { + mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_md, + weight_md, out_md); + return mkldnn::inner_product_backward_weights::primitive_desc( + ipBwdWeights_desc, engine, ipFwd_pd); + } +} + +void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, const std::vector &req, + const std::vector &out_data) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + const TShape& ishape = in_data[fullc::kData].shape(); + NDArray weight = in_data[fullc::kWeight]; + NDArray data = in_data[fullc::kData]; + if (data.shape().ndim() > 2 && !param.flatten) + data = data.Reshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1])); + else if (data.shape().ndim() > 2) + data = data.Reshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); + + mkldnn::inner_product_forward::primitive_desc ipFwd_pd = 
GetIPFwd(data, weight,
+      param.no_bias ? nullptr : &in_data[fullc::kBias], out_data[fullc::kOut]);
+  auto data_mem = data.GetMKLDNNDataReorder(ipFwd_pd.src_primitive_desc());
+  auto weight_mem = weight.GetMKLDNNDataReorder(ipFwd_pd.weights_primitive_desc());
+  auto out_mem = CreateMKLDNNMem(out_data[fullc::kOut],
+      ipFwd_pd.dst_primitive_desc(), req[fullc::kOut]);
+  if (param.no_bias) {
+    MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_forward(
+        ipFwd_pd, *data_mem, *weight_mem, *out_mem.second));
+  } else {
+    auto bias_mem = in_data[fullc::kBias].GetMKLDNNDataReorder(ipFwd_pd.bias_primitive_desc());
+    MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_forward(ipFwd_pd,
+        *data_mem, *weight_mem, *bias_mem, *out_mem.second));
+  }
+  CommitOutput(out_data[fullc::kOut], out_mem);
+  MKLDNNStream::Instance().Submit();
+}
+
+void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray> &inputs, const std::vector<OpReqType> &req,
+    const std::vector<NDArray> &outputs) {
+  const std::vector<NDArray> &in_grad = outputs;
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  const TShape& ishape = inputs[fullc::kData + 1].shape();
+  const TShape& oshape = inputs[fullc::kOut].shape();
+
+  NDArray weight = inputs[fullc::kWeight + 1];
+  NDArray data = inputs[fullc::kData + 1];
+  if (data.shape().ndim() > 2 && !param.flatten)
+    data = data.Reshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]));
+  else if (data.shape().ndim() > 2)
+    data = data.Reshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())));
+  NDArray out_grad = inputs[fullc::kOut];
+  if (out_grad.shape().ndim() > 2 && !param.flatten)
+    out_grad = out_grad.Reshape(Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]));
+  else if (out_grad.shape().ndim() > 2)
+    out_grad = out_grad.Reshape(Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())));
+
+  mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight,
+      param.no_bias ? nullptr : &in_grad[fullc::kBias], out_grad);
+
+  CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace";
+  if (req[fullc::kData]) {
+    mkldnn::inner_product_backward_data::primitive_desc ipBwdData_pd = GetIpBwdData(
+        data, weight, out_grad, ipFwd_pd);
+    auto out_grad_mem = out_grad.GetMKLDNNDataReorder(
+        ipBwdData_pd.diff_dst_primitive_desc());
+    auto weight_mem = weight.GetMKLDNNDataReorder(ipBwdData_pd.weights_primitive_desc());
+    auto in_grad_mem = CreateMKLDNNMem(in_grad[fullc::kData],
+        ipBwdData_pd.diff_src_primitive_desc(), req[fullc::kData]);
+    MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_backward_data(
+        ipBwdData_pd, *out_grad_mem, *weight_mem, *in_grad_mem.second));
+    CommitOutput(in_grad[fullc::kData], in_grad_mem);
+  }
+  if (req[fullc::kWeight]) {
+    mkldnn::inner_product_backward_weights::primitive_desc ipBwdWeights_pd
+        = GetIPBwdWeights(data, weight, param.no_bias ?
nullptr : &in_grad[fullc::kBias],
+            out_grad, ipFwd_pd);
+    auto out_grad_mem = out_grad.GetMKLDNNDataReorder(
+        ipBwdWeights_pd.diff_dst_primitive_desc());
+    auto data_mem = data.GetMKLDNNDataReorder(ipBwdWeights_pd.src_primitive_desc());
+    auto in_grad_weight = CreateMKLDNNMem(in_grad[fullc::kWeight],
+        ipBwdWeights_pd.diff_weights_primitive_desc(), req[fullc::kWeight]);
+    mkldnn_output_t in_grad_bias;
+    if (param.no_bias) {
+      MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_backward_weights(
+          ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second));
+    } else {
+      in_grad_bias = CreateMKLDNNMem(in_grad[fullc::kBias],
+          ipBwdWeights_pd.diff_bias_primitive_desc(), req[fullc::kBias]);
+      MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_backward_weights(
+          ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second,
+          *in_grad_bias.second));
+    }
+    CommitOutput(in_grad[fullc::kWeight], in_grad_weight);
+    CommitOutput(in_grad[fullc::kBias], in_grad_bias);
+  }
+  MKLDNNStream::Instance().Submit();
+}
+
+}
+}
+#endif  // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
new file mode 100644
index 000000000000..00bdefaff210
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_ops-inl.h
+ * \brief
+ * \author Da Zheng
+*/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_
+#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+/* For fully connected. */
+void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray> &in_data, const std::vector<OpReqType> &req,
+    const std::vector<NDArray> &out_data);
+void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray> &inputs, const std::vector<OpReqType> &req,
+    const std::vector<NDArray> &outputs);
+
+/* For convolution.
*/
+void MKLDNNConvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray> &in_data, const std::vector<OpReqType> &req,
+    const std::vector<NDArray> &out_data);
+void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray>& inputs, const std::vector<OpReqType>& req,
+    const std::vector<NDArray>& outputs);
+
+/* For deconvolution */
+void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray> &in_data, const std::vector<OpReqType> &req,
+    const std::vector<NDArray> &out_data);
+void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray>& inputs, const std::vector<OpReqType>& req,
+    const std::vector<NDArray>& outputs);
+
+/* For Concat */
+void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray> &in_data, const std::vector<OpReqType> &req,
+    const std::vector<NDArray> &out_data);
+void MKLDNNConcat_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray>& inputs, const std::vector<OpReqType>& req,
+    const std::vector<NDArray>& outputs);
+
+}
+}
+#endif  // MXNET_USE_MKLDNN == 1
+
+#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_
diff --git a/src/operator/softmax_activation.cu b/src/operator/nn/mkldnn/mkldnn_sum.cc
similarity index 54%
rename from src/operator/softmax_activation.cu
rename to src/operator/nn/mkldnn/mkldnn_sum.cc
index 5bebed2846b8..61ec1bbc4199 100644
--- a/src/operator/softmax_activation.cu
+++ b/src/operator/nn/mkldnn/mkldnn_sum.cc
@@ -18,26 +18,35 @@
 */
 /*!
- * \file softmax_activation.cu
+ * \file mkldnn_sum.cc
  * \brief
- * \author Junyuan Xie
+ * \author Da Zheng
 */
-#include "./softmax_activation-inl.h"
-#include "./mshadow_op.h"
-#if MXNET_USE_CUDNN == 1
-#include "./cudnn_softmax_activation-inl.h"
-#endif
+#include
+
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+#if MXNET_USE_MKLDNN == 1
 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<gpu>(SoftmaxActivationParam param) {
-#if MXNET_USE_CUDNN == 1
-  return new CuDNNSoftmaxActivationOp(param);
-#else
-  return new SoftmaxActivationOp<gpu>(param);
-#endif  // MXNET_USE_CUDNN
+
+void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2,
+         const mkldnn::memory &out) {
+  std::vector<mkldnn::memory::primitive_desc> input_pds(2);
+  std::vector<float> scales(2);
+  std::vector<mkldnn::primitive::at> inputs;
+  input_pds[0] = arr1.get_primitive_desc();
+  input_pds[1] = arr2.get_primitive_desc();
+  CHECK(input_pds[0] == input_pds[1]);
+  scales[0] = 1;
+  scales[1] = 1;
+  inputs.push_back(arr1);
+  inputs.push_back(arr2);
+  mkldnn::sum::primitive_desc sum_pd(scales, input_pds);
+  MKLDNNStream::Instance().RegisterPrim(mkldnn::sum(sum_pd, inputs, out));
 }
-}  // namespace op
-}  // namespace mxnet
+}
+}
+#endif
diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h
new file mode 100644
index 000000000000..b061f6deb04b
--- /dev/null
+++ b/src/operator/nn/pooling-inl.h
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file pooling-inl.h + * \brief + * \author Bing Xu, Jun Wu +*/ + +#ifndef MXNET_OPERATOR_NN_POOLING_INL_H_ +#define MXNET_OPERATOR_NN_POOLING_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" +#include "./pool.h" + +namespace mxnet { +namespace op { + +struct PoolingParam : public dmlc::Parameter { + TShape kernel; + TShape stride; + TShape pad; + int pool_type; + int pooling_convention; + bool global_pool; + bool cudnn_off; + DMLC_DECLARE_PARAMETER(PoolingParam) { + DMLC_DECLARE_FIELD(global_pool).set_default(false) + .describe("Ignore kernel size, do global pooling based on current input feature map. "); + + DMLC_DECLARE_FIELD(cudnn_off).set_default(false) + .describe("Turn off cudnn pooling and use MXNet pooling operator. "); + + DMLC_DECLARE_FIELD(kernel) + .enforce_nonzero() + .describe("pooling kernel size: (y, x) or (d, y, x)"); + + DMLC_DECLARE_FIELD(pool_type) + .add_enum("max", pool_enum::kMaxPooling) + .add_enum("avg", pool_enum::kAvgPooling) + .add_enum("sum", pool_enum::kSumPooling) + .describe("Pooling type to be applied."); + + DMLC_DECLARE_FIELD(pooling_convention).set_default(pool_enum::kValid) + .add_enum("full", pool_enum::kFull) + .add_enum("valid", pool_enum::kValid) + .describe("Pooling convention to be applied."); + + DMLC_DECLARE_FIELD(stride).set_default(TShape()) + .enforce_nonzero() + .describe("stride: for pooling (y, x) or (d, y, x)"); + + DMLC_DECLARE_FIELD(pad).set_default(TShape()) + .describe("pad for pooling: (y, x) or (d, y, x)"); + } +}; + +template +class PoolingOp { + public: + void Init(PoolingParam p) { + this->param_ = p; + } + + void Forward(const OpContext& ctx, const TBlob& in_data, + const OpReqType& req, const TBlob& out_data) { + using namespace mshadow; + Stream *s = ctx.get_stream(); + const TShape& ishape = in_data.shape_; + + pool(s, in_data.dptr(), in_data.shape_, out_data.shape_, + param_.global_pool? + TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) + : param_.kernel, + param_.pad, + param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, + param_.pool_type, req, out_data.dptr()); + } + + void Backward(const OpContext& ctx, const TBlob& out_grad, + const TBlob& in_data, const TBlob& out_data, + const OpReqType& req, const TBlob& in_grad) { + using namespace mshadow; + Stream *s = ctx.get_stream(); + const TShape& ishape = in_data.shape_; + + unpool(s, out_grad.dptr(), in_data.dptr(), out_data.dptr(), + in_grad.shape_, out_grad.shape_, + param_.global_pool? + TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) + : param_.kernel, + param_.pad, + param_.global_pool? 
TShape(param_.kernel.ndim()) : param_.stride, + param_.pool_type, req, in_grad.dptr()); + } + + private: + PoolingParam param_; +}; // class PoolingOp + +template +PoolingOp &GetPoolingOp(const PoolingParam ¶m) { + static thread_local PoolingOp op; + op.Init(param); + return op; +} + +template +void PoolingCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const PoolingParam& param = nnvm::get(attrs.parsed); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || pool_enum::kSumPooling == param.pool_type) { + GetPoolingOp(param).Forward(ctx, inputs[0], req[0], outputs[0]); + } else { + LOG(FATAL) << "unknown pooling type"; + } + }); +} + +template +void PoolingGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const PoolingParam& param = nnvm::get(attrs.parsed); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || pool_enum::kSumPooling == param.pool_type) { + GetPoolingOp(param).Backward(ctx, + inputs[0], inputs[1], inputs[2], req[0], outputs[0]); + } else { + LOG(FATAL) << "unknown pooling type"; + } + }); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NN_POOLING_INL_H_ diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc new file mode 100644 index 000000000000..83e2accc18ca --- /dev/null +++ b/src/operator/nn/pooling.cc @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file pooling.cc + * \brief + * \author Bing Xu, Jun Wu +*/ +#include "./pooling-inl.h" +#include "../elemwise_op_common.h" +#if MXNET_USE_MKL2017 == 1 +#include +#include "./mkl/mkl_memory-inl.h" +#include "./mkl/mkl_pooling-inl.h" +#endif // MXNET_USE_MKL2017 +#if MXNET_USE_NNPACK == 1 +#include "./nnpack/nnpack_pooling-inl.h" +#endif // MXNET_USE_NNPACK + +namespace mxnet { +namespace op { + +static void PoolingParamParser(nnvm::NodeAttrs* attrs) { + using namespace mshadow; + PoolingParam param_; + param_.Init(attrs->dict); + if (param_.kernel.ndim() == 1) { + if (param_.stride.ndim() == 0) param_.stride = Shape1(1); + if (param_.pad.ndim() == 0) param_.pad = Shape1(0); + } else if (param_.kernel.ndim() == 2) { + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + } else { + CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D pooling not supported"; + if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); + } + CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim()) + << "stride and kernel should have the same length"; + CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim()) + << "pad and kernel should have the same length"; + attrs->parsed = std::move(param_); +} + +static bool PoolingShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, std::vector *out_shape) { + const PoolingParam& param_ = nnvm::get(attrs.parsed); + CHECK_EQ(in_shape->size(), 1U); + const TShape &dshape = (*in_shape)[0]; + CHECK_GE(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)" + << " Or 4D in (batch, channel, y, x) " + << " Or 5D in (batch, channel, d, y, x)"; + TShape oshape = dshape; + if (dshape.ndim() == 0) return false; + if (param_.kernel.ndim() == 1) { + CHECK_EQ(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)"; + if (param_.global_pool) { + oshape[2] = 1; + } else { + CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) + << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] + << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; + if (param_.pooling_convention == pool_enum::kValid) { + oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); + } + } + out_shape->clear(); + out_shape->push_back(oshape); // save output shape + } else if (param_.kernel.ndim() == 2) { + CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; + if (param_.global_pool) { + oshape[2] = 1; + oshape[3] = 1; + } else { + CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) + << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] + << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; + CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1]) + << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3] + << " padded to " << (dshape[3] + 2*param_.pad[1]) << ")"; + if (param_.pooling_convention == pool_enum::kValid) { + oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / + param_.stride[1]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); + oshape[3] = 1 + 
static_cast<int>(ceil(static_cast<float>(
+            dshape[3] + 2 * param_.pad[1] -
+            param_.kernel[1]) / param_.stride[1]));
+      }
+    }
+    out_shape->clear();
+    out_shape->push_back(oshape);  // save output shape
+  } else if (param_.kernel.ndim() == 3) {
+    CHECK_EQ(dshape.ndim(), 5U)
+      << "Pooling: Input data should be 5D in (batch, channel, d, y, x)";
+    CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) << "kernel size exceeds input";
+    CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) << "kernel size exceeds input";
+    CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) << "kernel size exceeds input";
+    if (param_.global_pool) {
+      oshape[2] = 1;
+      oshape[3] = 1;
+      oshape[4] = 1;
+    } else {
+      if (param_.pooling_convention == pool_enum::kValid) {
+        oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
+            param_.stride[0];
+        oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) /
+            param_.stride[1];
+        oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) /
+            param_.stride[2];
+      } else {
+        oshape[2] = 1 + static_cast<int>(ceil(static_cast<float>(
+            dshape[2] + 2 * param_.pad[0] -
+            param_.kernel[0]) / param_.stride[0]));
+        oshape[3] = 1 + static_cast<int>(ceil(static_cast<float>(
+            dshape[3] + 2 * param_.pad[1] -
+            param_.kernel[1]) / param_.stride[1]));
+        oshape[4] = 1 + static_cast<int>(ceil(static_cast<float>(
+            dshape[4] + 2 * param_.pad[2] -
+            param_.kernel[2]) / param_.stride[2]));
+      }
+    }
+
+    out_shape->clear();
+    out_shape->push_back(oshape);  // save output shape
+  }
+  return true;
+}
+
+struct PoolingGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    std::vector<nnvm::NodeEntry> heads;
+    heads.push_back(ograds[pool_enum::kOut]);
+    heads.push_back(n->inputs[pool_enum::kData]);
+    heads.emplace_back(nnvm::NodeEntry{n, pool_enum::kOut, 0});
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
+
+DMLC_REGISTER_PARAMETER(PoolingParam);
+
+NNVM_REGISTER_OP(Pooling)
+.describe(R"code(Performs pooling on the input.
+
+The shapes for 1-D pooling are
+
+- **data**: *(batch_size, channel, width)*,
+- **out**: *(batch_size, num_filter, out_width)*.
+
+The shapes for 2-D pooling are
+
+- **data**: *(batch_size, channel, height, width)*
+- **out**: *(batch_size, num_filter, out_height, out_width)*, with::
+
+  out_height = f(height, kernel[0], pad[0], stride[0])
+  out_width = f(width, kernel[1], pad[1], stride[1])
+
+The definition of *f* depends on ``pooling_convention``, which has two options:
+
+- **valid** (default)::
+
+  f(x, k, p, s) = floor((x+2*p-k)/s)+1
+
+- **full**, which is compatible with Caffe::
+
+  f(x, k, p, s) = ceil((x+2*p-k)/s)+1
+
+If ``global_pool`` is set to true, then global pooling is applied, namely resetting
+``kernel=(height, width)``.
+
+Three pooling options are supported by ``pool_type``:
+
+- **avg**: average pooling
+- **max**: max pooling
+- **sum**: sum pooling
+
+For 3-D pooling, an additional *depth* dimension is added before
+*height*. Namely the input data will have shape *(batch_size, channel, depth,
+height, width)*.
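+
+For example, taking illustrative values ``kernel=(3,3)``, ``pad=(1,1)``,
+``stride=(2,2)`` and an input of height 224::
+
+  valid: out_height = floor((224 + 2*1 - 3)/2) + 1 = 112
+  full:  out_height = ceil((224 + 2*1 - 3)/2) + 1 = 113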
+ +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(PoolingParamParser) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FInferShape", PoolingShape) +.set_attr("FCompute", PoolingCompute) +.set_attr("FGradient", ElemwiseGradUseInOut{"_backward_Pooling"}) +.add_argument("data", "NDArray-or-Symbol", "Input data to the pooling operator.") +.add_arguments(PoolingParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_Pooling) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ +#if MXNET_USE_CUDNN == 1 + return std::vector >(); +#else + return std::vector >{{1, 0}}; +#endif +}) +.set_attr_parser(PoolingParamParser) +.set_attr("FCompute", PoolingGradCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/nn/pooling.cu b/src/operator/nn/pooling.cu new file mode 100644 index 000000000000..6f67def782d7 --- /dev/null +++ b/src/operator/nn/pooling.cu @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file pooling.cu + * \brief + * \author Bing Xu, Jun Wu +*/ +#include +#include "./pooling-inl.h" +#if MXNET_USE_CUDNN == 1 +#include "./cudnn/cudnn_pooling-inl.h" +#endif // MXNET_USE_CUDNN + +namespace mxnet { +namespace op { + +#if MXNET_USE_CUDNN == 1 +template +static CuDNNPoolingOp &GetCuDNNPoolingOp(const PoolingParam ¶m) { + static thread_local CuDNNPoolingOp op; + op.Init(param); + return op; +} +#endif + +template<> +void PoolingCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const PoolingParam& param = nnvm::get(attrs.parsed); + +#if MXNET_USE_CUDNN == 1 + if (!param.cudnn_off && param.kernel.ndim() > 1) { + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + switch (param.pool_type) { + case pool_enum::kMaxPooling: + case pool_enum::kAvgPooling: + GetCuDNNPoolingOp(param).Forward(ctx, inputs[0], req[0], outputs[0]); + return; + case pool_enum::kSumPooling: + LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; + break; + } + }); + } +#endif // MXNET_USE_CUDNN + + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || pool_enum::kSumPooling == param.pool_type) { + GetPoolingOp(param).Forward(ctx, inputs[0], req[0], outputs[0]); + } else { + LOG(FATAL) << "unknown pooling type"; + } + }); +} + +template<> +void PoolingGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const PoolingParam& param = nnvm::get(attrs.parsed); + +#if MXNET_USE_CUDNN == 1 + if (!param.cudnn_off && param.kernel.ndim() > 1) { + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + switch (param.pool_type) { + case pool_enum::kMaxPooling: + case pool_enum::kAvgPooling: + GetCuDNNPoolingOp(param).Backward(ctx, + inputs[0], inputs[1], inputs[2], req[0], outputs[0]); + return; + case pool_enum::kSumPooling: + LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; + break; + } + }); + } +#endif // MXNET_USE_CUDNN + + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || pool_enum::kSumPooling == param.pool_type) { + GetPoolingOp(param).Backward(ctx, + inputs[0], inputs[1], inputs[2], req[0], outputs[0]); + } else { + LOG(FATAL) << "unknown pooling type"; + } + }); +} + +NNVM_REGISTER_OP(Pooling) +.set_attr("FCompute", PoolingCompute); + +NNVM_REGISTER_OP(_backward_Pooling) +.set_attr("FCompute", PoolingGradCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/nn/softmax_activation-inl.h b/src/operator/nn/softmax_activation-inl.h new file mode 100644 index 000000000000..8422ce73a5ce --- /dev/null +++ b/src/operator/nn/softmax_activation-inl.h @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file softmax_activation-inl.h + * \brief SoftmaxActivation operator + * \author Junyuan Xie +*/ +#ifndef MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_ +#define MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" + +namespace mxnet { +namespace op { +// Declare enumeration of input order to make code more intuitive. +// // These enums are only visible within this header +namespace softmax_activation { +enum SoftmaxActivationOpInputs {kData}; +enum SoftmaxActivationOpOutputs {kOut}; +enum SoftmaxActivationOpType {kInstance, kChannel}; +enum SoftmaxActivationOpResource {kTempSpace}; +} // softmax_activation + +struct SoftmaxActivationParam : public dmlc::Parameter { + // use int for enumeration + int mode; + DMLC_DECLARE_PARAMETER(SoftmaxActivationParam) { + DMLC_DECLARE_FIELD(mode) + .add_enum("instance", softmax_activation::kInstance) + .add_enum("channel", softmax_activation::kChannel) + .set_default(softmax_activation::kInstance) + .describe("Specifies how to compute the softmax. If set to ``instance``, " + "it computes softmax for each instance. If set to ``channel``, " + "It computes cross channel softmax for each position of each instance."); + } +}; + +/** + * \brief This is the implementation of softmax_activation operator. + * \tparam xpu The device that the op will be executed on. + */ +template +class SoftmaxActivationOp { + public: + void Init(SoftmaxActivationParam p) { + this->param_ = p; + } + + void Forward(const OpContext &ctx, const TBlob &in_data, + const OpReqType &req, const TBlob &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + if (param_.mode == softmax_activation::kInstance) { + Tensor data = in_data.FlatTo2D(s); + Tensor out = out_data.FlatTo2D(s); + Softmax(out, data); + } else { + CHECK_GE(in_data.ndim(), 3) + << "Input need to have a least 3 dimensions when mode=channel"; + int n = in_data.size(0); + int k = in_data.size(1); + Shape<3> s3 = Shape3(n, k, static_cast(in_data.Size()/n/k)); + Tensor data = in_data.get_with_shape(s3, s); + Tensor out = out_data.get_with_shape(s3, s); + Softmax(out, data); + } + } + + void Backward(const OpContext &ctx, const TBlob &out_grad, + const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + // Use 3d tensor for both mode -> {instance, channel}. 
Get shapes + int total_size = in_grad.Size(); + int batch_size = in_grad.shape_[0]; + int channel_num = in_grad.shape_[1]; + int rest_size = total_size / (batch_size * channel_num); + const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size); + // Get tensors + Stream *s = ctx.get_stream(); + Tensor m_out_grad = + out_grad.get_with_shape(data_shape, s); + Tensor m_out_data = + out_data.get_with_shape(data_shape, s); + Tensor m_in_grad = + in_grad.get_with_shape(data_shape, s); + // get requested temp space + Tensor workspace = ctx.requested[softmax_activation::kTempSpace].get_space( + Shape2(batch_size, rest_size), s); + workspace = reduce_with_axis(m_out_grad * m_out_data, 1); + Assign(m_in_grad, req, + m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num))); + } + + private: + SoftmaxActivationParam param_; +}; // class SoftmaxActivationOp + + +template +void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + + static thread_local SoftmaxActivationOp op; + op.Init(param); + op.Forward(ctx, inputs[0], req[0], outputs[0]); +} + +template +void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1); + CHECK_EQ(req.size(), 1); + + static thread_local SoftmaxActivationOp op; + op.Init(param); + op.Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]); +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_ diff --git a/src/operator/softmax_activation.cc b/src/operator/nn/softmax_activation.cc similarity index 71% rename from src/operator/softmax_activation.cc rename to src/operator/nn/softmax_activation.cc index 115b0a730cde..a6452a6e8c65 100644 --- a/src/operator/softmax_activation.cc +++ b/src/operator/nn/softmax_activation.cc @@ -23,23 +23,15 @@ * \author Junyuan Xie */ #include "./softmax_activation-inl.h" -#include "./mshadow_op.h" +#include "../tensor/elemwise_unary_op.h" +#include "../mshadow_op.h" namespace mxnet { namespace op { -template<> -Operator *CreateOp(SoftmaxActivationParam param) { - return new SoftmaxActivationOp(param); -} - -// DO_BIND_DISPATCH comes from operator_common.h -Operator *SoftmaxActivationProp::CreateOperator(Context ctx) const { - DO_BIND_DISPATCH(CreateOp, param_); -} DMLC_REGISTER_PARAMETER(SoftmaxActivationParam); -MXNET_REGISTER_OP_PROPERTY(SoftmaxActivation, SoftmaxActivationProp) +MXNET_OPERATOR_REGISTER_UNARY(SoftmaxActivation) .describe(R"code(Applies softmax activation to input. This is intended for internal layers. .. 
note:: @@ -64,8 +56,22 @@ Example:: [ 6.56221947e-03 5.95310994e-04 9.73919690e-01 1.78379621e-02 1.08472735e-03]] )code" ADD_FILELINE) -.add_argument("data", "NDArray-or-Symbol", "Input array to activation function.") +.set_attr_parser(ParamParser) +.set_attr("FCompute", SoftmaxActivationCompute) +.set_attr("FGradient", ElemwiseGradUseOut{"_backward_SoftmaxActivation"}) .add_arguments(SoftmaxActivationParam::__FIELDS__()); +NNVM_REGISTER_OP(_backward_SoftmaxActivation) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}}; +}) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr_parser(ParamParser) +.set_attr("FCompute", SoftmaxActivationGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/softmax_activation.cu b/src/operator/nn/softmax_activation.cu new file mode 100644 index 000000000000..a28b75d2bfab --- /dev/null +++ b/src/operator/nn/softmax_activation.cu @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file softmax_activation.cu + * \brief + * \author Junyuan Xie +*/ +#include "./softmax_activation-inl.h" +#include "../mshadow_op.h" +#if MXNET_USE_CUDNN == 1 +#include "./cudnn/cudnn_softmax_activation-inl.h" +#endif + +namespace mxnet { +namespace op { + +template<> +void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + +#if MXNET_USE_CUDNN == 1 + static thread_local CuDNNSoftmaxActivationOp op; + op.Init(param); + op.Forward(ctx, inputs[0], req[0], outputs[0]); +#else + static thread_local SoftmaxActivationOp op; + op.Init(param); + op.Forward(ctx, inputs[0], req[0], outputs[0]); +#endif +} + +template<> +void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1); + CHECK_EQ(req.size(), 1); + +#if MXNET_USE_CUDNN == 1 + static thread_local CuDNNSoftmaxActivationOp op; + op.Init(param); + op.Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]); +#else + static thread_local SoftmaxActivationOp op; + op.Init(param); + op.Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]); +#endif +} + +NNVM_REGISTER_OP(SoftmaxActivation) +.set_attr("FCompute", SoftmaxActivationCompute); + +NNVM_REGISTER_OP(_backward_SoftmaxActivation) +.set_attr("FCompute", SoftmaxActivationGradCompute); + +} // namespace op +} // namespace mxnet + diff --git a/src/operator/upsampling-inl.h b/src/operator/nn/upsampling-inl.h similarity index 52% rename from src/operator/upsampling-inl.h rename to src/operator/nn/upsampling-inl.h index 77ea13bd6ccc..5847a8eda7e7 100644 --- a/src/operator/upsampling-inl.h +++ b/src/operator/nn/upsampling-inl.h @@ -22,8 +22,8 @@ * \brief * \author Bing Xu */ -#ifndef MXNET_OPERATOR_UPSAMPLING_INL_H_ -#define MXNET_OPERATOR_UPSAMPLING_INL_H_ +#ifndef MXNET_OPERATOR_NN_UPSAMPLING_INL_H_ +#define MXNET_OPERATOR_NN_UPSAMPLING_INL_H_ #include #include @@ -33,7 +33,8 @@ #include #include #include -#include "./operator_common.h" +#include "../operator_common.h" +#include "./deconvolution-inl.h" namespace mxnet { namespace op { @@ -81,17 +82,16 @@ struct UpSamplingParam : public dmlc::Parameter { }; // struct UpSamplingParam template -class UpSamplingNearestOp : public Operator { +class UpSamplingNearestOp { public: - explicit UpSamplingNearestOp(UpSamplingParam p) { + void Init(UpSamplingParam p) { this->param_ = p; } - virtual void Forward(const OpContext &ctx, + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(in_data.size(), static_cast(param_.num_args)); @@ -124,19 +124,14 @@ class UpSamplingNearestOp : public Operator { } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, + void Backward(const OpContext &ctx, const TBlob &out_grad, const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; - 
CHECK_EQ(out_grad.size(), 1U); CHECK_EQ(in_grad.size(), static_cast(param_.num_args)); Stream *s = ctx.get_stream(); - Tensor grad = out_grad[up_enum::kOut].get(s); + Tensor grad = out_grad.get(s); if (param_.num_args > 1) { int begin = 0; for (int i = 0; i < param_.num_args; ++i) { @@ -180,155 +175,68 @@ class UpSamplingNearestOp : public Operator { UpSamplingParam param_; }; // class UpSamplingNearestOp -template -Operator *CreateOp(UpSamplingParam param, int dtype); - - -#if DMLC_USE_CXX11 -class UpSamplingProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - std::vector ListArguments() const override { - if (param_.sample_type == up_enum::kNearest) { - std::vector ret; - for (int i = 0; i < param_.num_args; ++i) { - ret.push_back(std::string("arg") + std::to_string(i)); - } - return ret; - } else { - return {"data", "weight"}; - } - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - CHECK_GE(in_shape->size(), 1U); - const TShape &dshape = (*in_shape)[0]; - TShape oshape = dshape; - if (param_.sample_type == up_enum::kNearest) { - CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); - oshape[1] = 0; - for (auto& shape : *in_shape) { - CHECK_EQ(shape.ndim(), 4U) << \ - "UpSamplingNearest: Input data should be 4D in (batch, channel, y, x)"; - int oh = dshape[2]*param_.scale, ow = dshape[3]*param_.scale; - CHECK_EQ(oh%shape[2], 0U) << "UpSamplingNearest: input height of " << shape[2] << \ - "does not divide output height of " << oh; - CHECK_EQ(ow%shape[3], 0U) << "UpSamplingNearest: input width of " << shape[3] << \ - "does not divide output width of " << ow; - if (param_.multi_input_mode == up_enum::kSum) { - CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \ - "Number of channels must be the same when multi_input_mode==sum"; - oshape[1] = shape[1]; - } else { - oshape[1] += shape[1]; - } - } - } else { - CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; - CHECK_EQ(dshape.ndim(), 4U) << \ - "UpSamplingBilinear: Input data should be 4D in (batch, channel, y, x)"; - if (dshape.ndim() == 0) return false; - int kernel = 2 * param_.scale - param_.scale % 2; - SHAPE_ASSIGN_CHECK(*in_shape, - up_enum::kWeight, - mshadow::Shape4(dshape[1], 1, kernel, kernel)); - oshape = dshape; - } - oshape[2] = dshape[2] * param_.scale; - oshape[3] = dshape[3] * param_.scale; - out_shape->clear(); - out_shape->push_back(oshape); - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } - } - out_type->clear(); - out_type->push_back(dtype); - return true; - } +static inline DeconvolutionParam GetDeconvolutionParam(const UpSamplingParam& param) { + DeconvolutionParam p = DeconvolutionParam(); + int kernel = 2 * param.scale - param.scale % 2; + int stride = param.scale; + int pad = static_cast(ceil((param.scale - 1) / 2.)); + p.workspace = param.workspace; + p.num_group = param.num_filter; + p.num_filter = param.num_filter; + p.no_bias = true; + int shape[] = {1, 1}; + p.dilate = TShape(shape, shape + 2); + shape[0] = 
shape[1] = kernel; + p.kernel = TShape(shape, shape + 2); + shape[0] = shape[1] = stride; + p.stride = TShape(shape, shape + 2); + shape[0] = shape[1] = pad; + p.pad = TShape(shape, shape + 2); + return p; +} - OperatorProperty* Copy() const override { - auto ptr = new UpSamplingProp(); - ptr->param_ = this->param_; - return ptr; - } - - std::string TypeString() const override { - return "UpSampling"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - if (param_.sample_type == up_enum::kNearest) { - return {out_grad[up_enum::kOut]}; - } else { - return {out_grad[up_enum::kOut], in_data[up_enum::kData], in_data[up_enum::kWeight]}; - } - } - - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { - return {}; - } - - std::vector ForwardResource( - const std::vector &in_shape) const override { - if (param_.sample_type == up_enum::kNearest) { - return {}; - } else { - return {ResourceRequest::kTempSpace}; - } - } - - std::vector BackwardResource( - const std::vector &in_shape) const override { - if (param_.sample_type == up_enum::kNearest) { - return {}; - } else { - return {ResourceRequest::kTempSpace}; - } - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented"; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +template +void UpSamplingCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const UpSamplingParam& param = nnvm::get(attrs.parsed); + if (param.sample_type == up_enum::kNearest) { + MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { + static thread_local UpSamplingNearestOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); + }); + } else if (param.sample_type == up_enum::kBilinear) { + DeconvolutionParam p = GetDeconvolutionParam(param); + _DeconvolutionCompute(p, ctx, inputs, req, outputs); + } else { + LOG(FATAL) << "Unknown sample type"; + } +} +template +void UpSamplingGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const UpSamplingParam& param = nnvm::get(attrs.parsed); + if (param.sample_type == up_enum::kNearest) { + MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { + CHECK_EQ(inputs.size(), 1U); + static thread_local UpSamplingNearestOp op; + op.Init(param); + op.Backward(ctx, inputs[0], req, outputs); + }); + } else if (param.sample_type == up_enum::kBilinear) { + DeconvolutionParam p = GetDeconvolutionParam(param); + _DeconvolutionGradCompute(p, ctx, inputs, req, outputs); + } else { + LOG(FATAL) << "Unknown sample type"; + } +} - private: - UpSamplingParam param_; -}; // class UpSamplingProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_UPSAMPLING_INL_H_ +#endif // MXNET_OPERATOR_NN_UPSAMPLING_INL_H_ diff --git a/src/operator/nn/upsampling.cc b/src/operator/nn/upsampling.cc new file mode 100644 index 000000000000..a3564a59f529 --- /dev/null +++ b/src/operator/nn/upsampling.cc @@ -0,0 +1,175 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file upsampling_nearest.cc + * \brief + * \author Bing Xu +*/ + +#include "./upsampling-inl.h" +#include +#include "./deconvolution-inl.h" + +namespace mxnet { +namespace op { + +static bool UpSamplingShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, std::vector *out_shape) { + const UpSamplingParam& param_ = nnvm::get(attrs.parsed); + CHECK_GE(in_shape->size(), 1U); + const TShape &dshape = (*in_shape)[0]; + TShape oshape = dshape; + if (param_.sample_type == up_enum::kNearest) { + CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); + oshape[1] = 0; + for (auto& shape : *in_shape) { + CHECK_EQ(shape.ndim(), 4U) << \ + "UpSamplingNearest: Input data should be 4D in (batch, channel, y, x)"; + int oh = dshape[2]*param_.scale, ow = dshape[3]*param_.scale; + CHECK_EQ(oh%shape[2], 0U) << "UpSamplingNearest: input height of " << shape[2] << \ + "does not divide output height of " << oh; + CHECK_EQ(ow%shape[3], 0U) << "UpSamplingNearest: input width of " << shape[3] << \ + "does not divide output width of " << ow; + if (param_.multi_input_mode == up_enum::kSum) { + CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \ + "Number of channels must be the same when multi_input_mode==sum"; + oshape[1] = shape[1]; + } else { + oshape[1] += shape[1]; + } + } + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + CHECK_EQ(dshape.ndim(), 4U) << \ + "UpSamplingBilinear: Input data should be 4D in (batch, channel, y, x)"; + if (dshape.ndim() == 0) return false; + int kernel = 2 * param_.scale - param_.scale % 2; + SHAPE_ASSIGN_CHECK(*in_shape, + up_enum::kWeight, + mshadow::Shape4(dshape[1], 1, kernel, kernel)); + oshape = dshape; + } + oshape[2] = dshape[2] * param_.scale; + oshape[3] = dshape[3] * param_.scale; + out_shape->clear(); + out_shape->push_back(oshape); + return true; +} + +static inline std::vector ListArguments(const UpSamplingParam& param) { + if (param.sample_type == up_enum::kNearest) { + std::vector ret; + for (int i = 0; i < param.num_args; ++i) { + ret.push_back(std::string("arg") + std::to_string(i)); + } + return ret; + } else { + return {"data", "weight"}; + } +} + +static bool UpSamplingType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + const UpSamplingParam& param = nnvm::get(attrs.parsed); + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param)[i]); + } + } + out_type->clear(); + out_type->push_back(dtype); + return true; +} + +struct UpSamplingGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + 
const UpSamplingParam& param_ = nnvm::get(n->attrs.parsed); + std::vector heads(ograds.begin(), ograds.end()); + if (param_.sample_type != up_enum::kNearest) { + heads.push_back(n->inputs[up_enum::kData]); + heads.push_back(n->inputs[up_enum::kWeight]); + } + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + +DMLC_REGISTER_PARAMETER(UpSamplingParam); + +NNVM_REGISTER_OP(UpSampling) +.describe("Performs nearest neighbor/bilinear up sampling to inputs.") +.set_num_inputs([](const NodeAttrs& attrs) { + const UpSamplingParam& params = nnvm::get(attrs.parsed); + return params.sample_type == up_enum::kNearest ? params.num_args : 2; +}) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return ListArguments(nnvm::get(attrs.parsed)); +}) +.set_attr("FInferShape", UpSamplingShape) +.set_attr("FInferType", UpSamplingType) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + const UpSamplingParam& param = nnvm::get(n.parsed); + if (param.sample_type == up_enum::kNearest) { + return std::vector(); + } else { + return std::vector{ResourceRequest::kTempSpace}; + } +}) +.set_attr("FCompute", UpSamplingCompute) +.set_attr("FGradient", UpSamplingGrad{"_backward_UpSampling"}) +.set_attr("key_var_num_args", "num_args") +.add_argument("data", "NDArray-or-Symbol[]", "Array of tensors to upsample") +.add_arguments(UpSamplingParam::__FIELDS__()) +.set_attr("FSetInputVarAttrOnCompose", + [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { + if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return; + if (index == 1) { + var->attrs.dict["__init__"] = "[\"bilinear\", {}]"; + } + }); + +NNVM_REGISTER_OP(_backward_UpSampling) +.set_num_outputs([](const NodeAttrs& attrs) { + const UpSamplingParam& params = nnvm::get(attrs.parsed); + return params.sample_type == up_enum::kNearest ? params.num_args : 2; +}) +.set_attr("TIsBackward", true) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + const UpSamplingParam& param = nnvm::get(n.parsed); + if (param.sample_type == up_enum::kNearest) { + return std::vector(); + } else { + return std::vector{ResourceRequest::kTempSpace}; + } +}) +.set_attr_parser(ParamParser) +.set_attr("FCompute", UpSamplingGradCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/cudnn_batch_norm.cu b/src/operator/nn/upsampling.cu similarity index 74% rename from src/operator/cudnn_batch_norm.cu rename to src/operator/nn/upsampling.cu index c16fc0cac25b..9d9ebacbcb2c 100644 --- a/src/operator/cudnn_batch_norm.cu +++ b/src/operator/nn/upsampling.cu @@ -18,22 +18,22 @@ */ /*! 
- * \file cudnn_batch_norm.cu + * \file upsampling_nearest.cc * \brief - * \author Junyuan Xie + * \author Bing Xu */ -#include "./cudnn_batch_norm-inl.h" -#include +#include "./deconvolution-inl.h" +#include "./upsampling-inl.h" namespace mxnet { namespace op { -#if CUDNN_MAJOR == 4 -template<> -Operator *CreateOp_CuDNNv4(BatchNormParam param) { - return new CuDNNBatchNormOp(param); -} -#endif // CUDNN_MAJOR == 4 + +NNVM_REGISTER_OP(UpSampling) +.set_attr("FCompute", UpSamplingCompute); + +NNVM_REGISTER_OP(_backward_UpSampling) +.set_attr("FCompute", UpSamplingGradCompute); + } // namespace op } // namespace mxnet - diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h deleted file mode 100644 index fbc6981a7591..000000000000 --- a/src/operator/pooling-inl.h +++ /dev/null @@ -1,334 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file pooling-inl.h - * \brief - * \author Bing Xu, Jun Wu -*/ - -#ifndef MXNET_OPERATOR_POOLING_INL_H_ -#define MXNET_OPERATOR_POOLING_INL_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "./operator_common.h" -#include "./nn/pool.h" - -namespace mxnet { -namespace op { - -struct PoolingParam : public dmlc::Parameter { - TShape kernel; - TShape stride; - TShape pad; - int pool_type; - int pooling_convention; - bool global_pool; - bool cudnn_off; - DMLC_DECLARE_PARAMETER(PoolingParam) { - DMLC_DECLARE_FIELD(global_pool).set_default(false) - .describe("Ignore kernel size, do global pooling based on current input feature map. "); - - DMLC_DECLARE_FIELD(cudnn_off).set_default(false) - .describe("Turn off cudnn pooling and use MXNet pooling operator. 
"); - - DMLC_DECLARE_FIELD(kernel) - .enforce_nonzero() - .describe("pooling kernel size: (y, x) or (d, y, x)"); - - DMLC_DECLARE_FIELD(pool_type) - .add_enum("max", pool_enum::kMaxPooling) - .add_enum("avg", pool_enum::kAvgPooling) - .add_enum("sum", pool_enum::kSumPooling) - .describe("Pooling type to be applied."); - - DMLC_DECLARE_FIELD(pooling_convention).set_default(pool_enum::kValid) - .add_enum("full", pool_enum::kFull) - .add_enum("valid", pool_enum::kValid) - .describe("Pooling convention to be applied."); - - DMLC_DECLARE_FIELD(stride).set_default(TShape()) - .enforce_nonzero() - .describe("stride: for pooling (y, x) or (d, y, x)"); - - DMLC_DECLARE_FIELD(pad).set_default(TShape()) - .describe("pad for pooling: (y, x) or (d, y, x)"); - } -}; - -template -class PoolingOp : public Operator { - public: - explicit PoolingOp(PoolingParam p) { - this->param_ = p; - } - - virtual void Forward(const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data, - const std::vector& aux_args) { - using namespace mshadow; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - Stream *s = ctx.get_stream(); - const TShape& ishape = in_data[pool_enum::kData].shape_; - - pool(s, in_data[pool_enum::kData].dptr(), - in_data[pool_enum::kData].shape_, - out_data[pool_enum::kOut].shape_, - param_.global_pool? - TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) - : param_.kernel, - param_.pad, - param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, - param_.pool_type, - req[pool_enum::kOut], - out_data[pool_enum::kOut].dptr()); - } - - virtual void Backward(const OpContext& ctx, - const std::vector& out_grad, - const std::vector& in_data, - const std::vector& out_data, - const std::vector& req, - const std::vector& in_grad, - const std::vector& aux_args) { - using namespace mshadow; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - CHECK_EQ(req.size(), 1U); - CHECK_EQ(in_grad.size(), 1U); - Stream *s = ctx.get_stream(); - const TShape& ishape = in_data[pool_enum::kData].shape_; - - unpool(s, out_grad[pool_enum::kOut].dptr(), - in_data[pool_enum::kData].dptr(), - out_data[pool_enum::kOut].dptr(), - in_grad[pool_enum::kData].shape_, - out_grad[pool_enum::kOut].shape_, - param_.global_pool? - TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) - : param_.kernel, - param_.pad, - param_.global_pool? 
TShape(param_.kernel.ndim()) : param_.stride, - param_.pool_type, - req[pool_enum::kData], - in_grad[pool_enum::kData].dptr()); - } - - private: - PoolingParam param_; -}; // class PoolingOp - -template -Operator* CreateOp(PoolingParam param, int dtype); - - -#if DMLC_USE_CXX11 -class PoolingProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - using namespace mshadow; - param_.Init(kwargs); - if (param_.kernel.ndim() == 1) { - if (param_.stride.ndim() == 0) param_.stride = Shape1(1); - if (param_.pad.ndim() == 0) param_.pad = Shape1(0); - } else if (param_.kernel.ndim() == 2) { - if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); - } else { - CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D pooling not supported"; - if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); - } - CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim()) - << "stride and kernel should have the same length"; - CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim()) - << "pad and kernel should have the same length"; - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - CHECK_EQ(in_shape->size(), 1U); - const TShape &dshape = (*in_shape)[0]; - CHECK_GE(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)" - << " Or 4D in (batch, channel, y, x) " - << " Or 5D in (batch, channel, d, y, x)"; - TShape oshape = dshape; - if (dshape.ndim() == 0) return false; - if (param_.kernel.ndim() == 1) { - CHECK_EQ(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)"; - if (param_.global_pool) { - oshape[2] = 1; - } else { - CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) - << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] - << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; - if (param_.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / - param_.stride[0]; - } else { - oshape[2] = 1 + static_cast(ceil(static_cast( - dshape[2] + 2 * param_.pad[0] - - param_.kernel[0]) / param_.stride[0])); - } - } - out_shape->clear(); - out_shape->push_back(oshape); // save output shape - } else if (param_.kernel.ndim() == 2) { - CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; - if (param_.global_pool) { - oshape[2] = 1; - oshape[3] = 1; - } else { - CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) - << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] - << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; - CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1]) - << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3] - << " padded to " << (dshape[3] + 2*param_.pad[1]) << ")"; - if (param_.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / - param_.stride[0]; - oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / - param_.stride[1]; - } else { - oshape[2] = 1 + static_cast(ceil(static_cast( - dshape[2] + 2 * param_.pad[0] - - param_.kernel[0]) / param_.stride[0])); - oshape[3] = 1 + static_cast(ceil(static_cast( - dshape[3] + 2 * param_.pad[1] - - param_.kernel[1]) / param_.stride[1])); - } - } - 
-      out_shape->clear();
-      out_shape->push_back(oshape);  // save output shape
-    } else if (param_.kernel.ndim() == 3) {
-      CHECK_EQ(dshape.ndim(), 5U)
-          << "Pooling: Input data should be 5D in (batch, channel, d, y, x)";
-      CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) << "kernel size exceeds input";
-      CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) << "kernel size exceeds input";
-      CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) << "kernel size exceeds input";
-      if (param_.global_pool) {
-        oshape[2] = 1;
-        oshape[3] = 1;
-        oshape[4] = 1;
-      } else {
-        if (param_.pooling_convention == pool_enum::kValid) {
-          oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
-                          param_.stride[0];
-          oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) /
-                          param_.stride[1];
-          oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) /
-                          param_.stride[2];
-        } else {
-          oshape[2] = 1 + static_cast<int>(ceil(static_cast<float>(
-                          dshape[2] + 2 * param_.pad[0] -
-                          param_.kernel[0]) / param_.stride[0]));
-          oshape[3] = 1 + static_cast<int>(ceil(static_cast<float>(
-                          dshape[3] + 2 * param_.pad[1] -
-                          param_.kernel[1]) / param_.stride[1]));
-          oshape[4] = 1 + static_cast<int>(ceil(static_cast<float>(
-                          dshape[4] + 2 * param_.pad[2] -
-                          param_.kernel[2]) / param_.stride[2]));
-        }
-      }
-
-      out_shape->clear();
-      out_shape->push_back(oshape);  // save output shape
-    }
-    return true;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_EQ(in_type->size(), 1U);
-    int dtype = (*in_type)[0];
-
-    if (dtype == -1) {
-      LOG(FATAL) << "Input type to pooling is not specified.";
-      return false;
-    }
-
-    out_type->clear();
-    out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    PoolingProp *prop_sym = new PoolingProp();
-    prop_sym->param_ = this->param_;
-    return prop_sym;
-  }
-
-  std::string TypeString() const override {
-    return "Pooling";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[pool_enum::kOut], in_data[pool_enum::kData],
-            out_data[pool_enum::kOut]};
-  }
-
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-#if MXNET_USE_CUDNN == 1
-    return {};
-#else
-    return {{in_data[pool_enum::kData], in_grad[pool_enum::kData]}};
-#endif
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented.";
-    return NULL;
-  }
-
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
-
- private:
-  PoolingParam param_;
-};  // class PoolingProp
-#endif  // DMLC_USE_CXX11
-}  // namespace op
-}  // namespace mxnet
-
-#endif  // MXNET_OPERATOR_POOLING_INL_H_
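The `valid`/`full` distinction that `InferShape` implements three times above reduces to floor vs. ceil in the window count. A self-contained sketch of the per-axis formula (helper name is mine, not from the patch):

```cpp
#include <cmath>

// Output size along one axis for input x, kernel k, pad p, stride s.
inline int PoolOutSize(int x, int k, int p, int s, bool full_convention) {
  if (full_convention) {
    // "full": ceil, compatible with Caffe.
    return 1 + static_cast<int>(std::ceil(static_cast<float>(x + 2 * p - k) / s));
  }
  return 1 + (x + 2 * p - k) / s;  // "valid": integer division floors
}
// e.g. x=8, k=3, p=0, s=2: valid -> 3, full -> 4
```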
diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc
deleted file mode 100644
index 98a3e076fad0..000000000000
--- a/src/operator/pooling.cc
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file pooling.cc
- * \brief
- * \author Bing Xu, Jun Wu
-*/
-#include "./pooling-inl.h"
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#include "./mkl/mkl_memory-inl.h"
-#include "./mkl/mkl_pooling-inl.h"
-#endif  // MXNET_USE_MKL2017
-#if MXNET_USE_NNPACK == 1
-#include "./nnpack/nnpack_pooling-inl.h"
-#endif  // MXNET_USE_NNPACK
-
-namespace mxnet {
-namespace op {
-
-template<>
-Operator *CreateOp<cpu>(PoolingParam param, int dtype) {
-  Operator *op = NULL;
-#if MXNET_USE_MKL2017 == 1
-  if (param.kernel.ndim() == 2
-      && ((param.pool_type == pool_enum::kMaxPooling)
-      || (param.pool_type == pool_enum::kAvgPooling))) {
-    switch (dtype) {
-    case mshadow::kFloat32:
-      return new MKLPoolingOp<cpu, float>(param);
-    case mshadow::kFloat64:
-      return new MKLPoolingOp<cpu, double>(param);
-    default:
-      break;
-    }
-  }
-#endif
-#if MXNET_USE_NNPACK == 1
-  // NNPACK only support max-pooling with kernel = 2, stride = 2, pooling_convention
-  // = kFull(note that the default value is kValid in MXNet)
-  if ((param.pool_type == pool_enum::kMaxPooling)
-      && (param.pooling_convention == pool_enum::kFull)
-      && (param.kernel.ndim() == 2) && (param.stride.ndim() == 2)
-      && (param.kernel[0] == 2) && (param.kernel[1] == 2)
-      && (param.stride[0] == 2) && (param.stride[1] == 2)) {
-    switch (dtype) {
-    case mshadow::kFloat32:
-      return new NNPACKPoolingOp<cpu, float>(param);
-    default:
-      break;
-    }
-  }
-#endif
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    if (pool_enum::kMaxPooling == param.pool_type
-        || pool_enum::kAvgPooling == param.pool_type
-        || pool_enum::kSumPooling == param.pool_type) {
-      op = new PoolingOp<cpu, DType>(param);
-    } else {
-      LOG(FATAL) << "unknown pooling type";
-      return NULL;
-    }
-  });
-
-  return op;
-}
-
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator* PoolingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                        std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
-}
-
-DMLC_REGISTER_PARAMETER(PoolingParam);
-
-MXNET_REGISTER_OP_PROPERTY(Pooling, PoolingProp)
-.describe(R"code(Performs pooling on the input.
-
-The shapes for 1-D pooling are
-
-- **data**: *(batch_size, channel, width)*,
-- **out**: *(batch_size, num_filter, out_width)*.
-
-The shapes for 2-D pooling are
-
-- **data**: *(batch_size, channel, height, width)*
-- **out**: *(batch_size, num_filter, out_height, out_width)*, with::
-
-    out_height = f(height, kernel[0], pad[0], stride[0])
-    out_width = f(width, kernel[1], pad[1], stride[1])
-
-The definition of *f* depends on ``pooling_convention``, which has two options:
-
-- **valid** (default)::
-
-    f(x, k, p, s) = floor((x+2*p-k)/s)+1
-
-- **full**, which is compatible with Caffe::
-
-    f(x, k, p, s) = ceil((x+2*p-k)/s)+1
-
-But ``global_pool`` is set to be true, then do a global pooling, namely reset
-``kernel=(height, width)``.
-
-Three pooling options are supported by ``pool_type``:
-
-- **avg**: average pooling
-- **max**: max pooling
-- **sum**: sum pooling
-
-For 3-D pooling, an additional *depth* dimension is added before
-*height*. Namely the input data will have shape *(batch_size, channel, depth,
-height, width)*.
-
-)code" ADD_FILELINE)
-.add_argument("data", "NDArray-or-Symbol", "Input data to the pooling operator.")
-.add_arguments(PoolingParam::__FIELDS__());
-
-}  // namespace op
-}  // namespace mxnet
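Worth keeping in mind while this file is removed: the CPU factory above tries MKL2017 first, then NNPACK, and only then falls back to the generic `PoolingOp`. The NNPACK branch fires for exactly one configuration; restated as a predicate (hypothetical helper, not in the patch):

```cpp
bool IsNNPACKPoolingEligible(const PoolingParam& p) {
  // Only 2x2 max-pooling with stride 2 under the "full" convention;
  // since kValid is the MXNet default, most models never hit this path.
  return p.pool_type == pool_enum::kMaxPooling &&
         p.pooling_convention == pool_enum::kFull &&
         p.kernel.ndim() == 2 && p.stride.ndim() == 2 &&
         p.kernel[0] == 2 && p.kernel[1] == 2 &&
         p.stride[0] == 2 && p.stride[1] == 2;
}
```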
diff --git a/src/operator/pooling.cu b/src/operator/pooling.cu
deleted file mode 100644
index 950f09956258..000000000000
--- a/src/operator/pooling.cu
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file pooling.cu
- * \brief
- * \author Bing Xu, Jun Wu
-*/
-#include <vector>
-#include "./pooling-inl.h"
-#if MXNET_USE_CUDNN == 1
-#include "./cudnn_pooling-inl.h"
-#endif  // MXNET_USE_CUDNN
-
-namespace mxnet {
-namespace op {
-
-template<>
-Operator *CreateOp<gpu>(PoolingParam param, int dtype) {
-  Operator *op = NULL;
-#if MXNET_USE_CUDNN == 1
-  if (!param.cudnn_off && param.kernel.ndim() > 1) {
-    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      switch (param.pool_type) {
-        case pool_enum::kMaxPooling:
-          op = new CuDNNPoolingOp<DType>(param);
-          break;
-        case pool_enum::kAvgPooling:
-          op = new CuDNNPoolingOp<DType>(param);
-          break;
-        case pool_enum::kSumPooling:
-          LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied.";
-          break;
-      }
-    });
-  }
-  if (op) return op;
-#endif  // MXNET_USE_CUDNN
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    if (pool_enum::kMaxPooling == param.pool_type
-        || pool_enum::kAvgPooling == param.pool_type
-        || pool_enum::kSumPooling == param.pool_type) {
-      op = new PoolingOp<gpu, DType>(param);
-    } else {
-      LOG(FATAL) << "unknown pooling type";
-    }
-  });
-  return op;
-}
-
-}  // namespace op
-}  // namespace mxnet
diff --git a/src/operator/softmax_activation-inl.h b/src/operator/softmax_activation-inl.h
deleted file mode 100644
index b1b76930b483..000000000000
--- a/src/operator/softmax_activation-inl.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file softmax_activation-inl.h
- * \brief SoftmaxActivation operator
- * \author Junyuan Xie
-*/
-#ifndef MXNET_OPERATOR_SOFTMAX_ACTIVATION_INL_H_
-#define MXNET_OPERATOR_SOFTMAX_ACTIVATION_INL_H_
-
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <cstring>
-#include <map>
-#include <string>
-#include <vector>
-#include <utility>
-#include "./operator_common.h"
-
-namespace mxnet {
-namespace op {
-// Declare enumeration of input order to make code more intuitive.
-// // These enums are only visible within this header
-namespace softmax_activation {
-enum SoftmaxActivationOpInputs {kData};
-enum SoftmaxActivationOpOutputs {kOut};
-enum SoftmaxActivationOpType {kInstance, kChannel};
-enum SoftmaxActivationOpResource {kTempSpace};
-}  // softmax_activation
-
-struct SoftmaxActivationParam : public dmlc::Parameter<SoftmaxActivationParam> {
-  // use int for enumeration
-  int mode;
-  DMLC_DECLARE_PARAMETER(SoftmaxActivationParam) {
-    DMLC_DECLARE_FIELD(mode)
-    .add_enum("instance", softmax_activation::kInstance)
-    .add_enum("channel", softmax_activation::kChannel)
-    .set_default(softmax_activation::kInstance)
-    .describe("Specifies how to compute the softmax. If set to ``instance``, "
-              "it computes softmax for each instance. If set to ``channel``, "
-              "It computes cross channel softmax for each position of each instance.");
-  }
-};
-
-/**
- * \brief This is the implementation of softmax_activation operator.
- * \tparam xpu The device that the op will be executed on.
- */
-template<typename xpu>
-class SoftmaxActivationOp : public Operator {
- public:
-  explicit SoftmaxActivationOp(SoftmaxActivationParam p) {
-    this->param_ = p;
-  }
-
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    if (param_.mode == softmax_activation::kInstance) {
-      Tensor<xpu, 2> data = in_data[softmax_activation::kData].FlatTo2D<xpu, real_t>(s);
-      Tensor<xpu, 2> out = out_data[softmax_activation::kOut].FlatTo2D<xpu, real_t>(s);
-      Softmax(out, data);
-    } else {
-      CHECK_GE(in_data[softmax_activation::kData].ndim(), 3)
-          << "Input need to have a least 3 dimensions when mode=channel";
-      int n = in_data[softmax_activation::kData].size(0);
-      int k = in_data[softmax_activation::kData].size(1);
-      Shape<3> s3 = Shape3(n, k, static_cast<int>(in_data[softmax_activation::kData].Size()/n/k));
-      Tensor<xpu, 3> data =
-          in_data[softmax_activation::kData].get_with_shape<xpu, 3, real_t>(s3, s);
-      Tensor<xpu, 3> out =
-          out_data[softmax_activation::kOut].get_with_shape<xpu, 3, real_t>(s3, s);
-      Softmax(out, data);
-    }
-  }
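For readers unfamiliar with the `channel` mode in the `Forward` above: the input is viewed as `(n, k, rest)`, so the softmax runs over axis 1 independently at every remaining position. A concrete sketch of the reshape (the helper is mine, only the `Shape3` call comes from the code above):

```cpp
#include <mshadow/tensor.h>

// A (2, 10, 5, 5) NCHW input is viewed as (2, 10, 25): the softmax over
// the 10 channels is computed at each of the 25 spatial positions.
mshadow::Shape<3> ChannelModeShape(int n, int k, int total_size) {
  return mshadow::Shape3(n, k, total_size / n / k);
}
// ChannelModeShape(2, 10, 2 * 10 * 5 * 5) -> (2, 10, 25)
```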
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK(in_data.size() == 1 && in_grad.size() == 1);
-    CHECK_EQ(req.size(), 1U);
-    // Use 3d tensor for both mode -> {instance, channel}. Get shapes
-    int total_size = in_grad[softmax_activation::kData].Size();
-    int batch_size = in_grad[softmax_activation::kData].shape_[0];
-    int channel_num = in_grad[softmax_activation::kData].shape_[1];
-    int rest_size = total_size / (batch_size * channel_num);
-    const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size);
-    // Get tensors
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 3> m_out_grad =
-        out_grad[softmax_activation::kOut].get_with_shape<xpu, 3, real_t>(data_shape, s);
-    Tensor<xpu, 3> m_out_data =
-        out_data[softmax_activation::kOut].get_with_shape<xpu, 3, real_t>(data_shape, s);
-    Tensor<xpu, 3> m_in_grad =
-        in_grad[softmax_activation::kData].get_with_shape<xpu, 3, real_t>(data_shape, s);
-    // get requested temp space
-    Tensor<xpu, 2> workspace = ctx.requested[softmax_activation::kTempSpace].get_space<xpu>(
-        Shape2(batch_size, rest_size), s);
-    workspace = reduce_with_axis<red::sum, false>(m_out_grad * m_out_data, 1);
-    Assign(m_in_grad, req[softmax_activation::kData],
-           m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num)));
-  }
-
- private:
-  SoftmaxActivationParam param_;
-};  // class SoftmaxActivationOp
-
-// Decalre Factory function, used for dispatch specialization
-template<typename xpu>
-Operator* CreateOp(SoftmaxActivationParam type);
-
-#if DMLC_USE_CXX11
-class SoftmaxActivationProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), 1U) << "Input:[data]";
-    const TShape &dshape = in_shape->at(softmax_activation::kData);
-    if (dshape.ndim() == 0) return false;
-    out_shape->clear();
-    out_shape->push_back(dshape);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new SoftmaxActivationProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "SoftmaxActivation";
-  }
-
-  // decalre dependency and inplace optimization options
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[softmax_activation::kOut], out_data[softmax_activation::kOut]};
-  }
-
-  std::vector<ResourceRequest> BackwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    return {ResourceRequest::kTempSpace};
-  }
-
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-    return {{out_grad[softmax_activation::kOut], in_grad[softmax_activation::kData]}};
-  }
-
-  std::vector<std::pair<int, void*> > ForwardInplaceOption(
-    const std::vector<int> &in_data,
-    const std::vector<void*> &out_data) const override {
-    return {{in_data[softmax_activation::kData], out_data[softmax_activation::kOut]}};
-  }
-
-  Operator* CreateOperator(Context ctx) const override;
-
- private:
-  SoftmaxActivationParam param_;
-};
-#endif  // DMLC_USE_CXX11
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_SOFTMAX_ACTIVATION_INL_H_
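The expression template at the end of `Backward` is the usual softmax Jacobian-vector product. For reference, with y = softmax(x) taken along the channel axis:

```latex
\frac{\partial L}{\partial x_i}
  = y_i \Bigl( \frac{\partial L}{\partial y_i}
               - \sum_j y_j \, \frac{\partial L}{\partial y_j} \Bigr)
```

The inner sum is the `reduce_with_axis` into `workspace`, and the subtraction uses `broadcast_with_axis` to replicate it back across the channel dimension.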
diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h
index ebe19d41bbc4..41b4eaa1aeca 100644
--- a/src/operator/tensor/cast_storage-inl.h
+++ b/src/operator/tensor/cast_storage-inl.h
@@ -324,6 +324,11 @@ void CastStorageCsrDnsImpl(const OpContext& ctx,
   });
 }
 
+#if MXNET_USE_MKLDNN == 1
+void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, TBlob* dns);
+void CastStorageDnsMKLImpl(const OpContext& ctx, const NDArray& src, const NDArray &dns);
+#endif
+
 template<typename xpu>
 void CastStorageComputeImpl(const OpContext& ctx,
                             const NDArray& input,
@@ -342,8 +347,15 @@ void CastStorageComputeImpl(const OpContext& ctx,
   } else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) {
     TBlob ret = output.data();
     CastStorageCsrDnsImpl(ctx, input, &ret);
+#if MXNET_USE_MKLDNN == 1
+  } else if (src_stype == kMKLDNNStorage && dst_stype == kDefaultStorage) {
+    TBlob ret = output.data();
+    CastStorageMKLDnsImpl(ctx, input, &ret);
+  } else if (src_stype == kDefaultStorage && dst_stype == kMKLDNNStorage) {
+    CastStorageDnsMKLImpl(ctx, input, output);
+#endif
   } else {
-    LOG(FATAL) << "Not implemented";
+    LOG(FATAL) << "Not implemented from " << src_stype << " to " << dst_stype;
   }
 }
 
diff --git a/src/operator/tensor/cast_storage.cc b/src/operator/tensor/cast_storage.cc
index 9f257b140f7b..9d6e2ec20759 100644
--- a/src/operator/tensor/cast_storage.cc
+++ b/src/operator/tensor/cast_storage.cc
@@ -25,10 +25,47 @@
 #include "./cast_storage-inl.h"
 #include "../elemwise_op_common.h"
 #include "../tensor/elemwise_unary_op.h"
+#include "../nn/mkldnn/mkldnn_base-inl.h"
 
 namespace mxnet {
 namespace op {
 
+#if MXNET_USE_MKLDNN == 1
+static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) {
+  switch(dtype) {
+    case mshadow::kFloat32:
+      return mkldnn::memory::data_type::f32;
+    default:
+      return mkldnn::memory::data_type::data_undef;
+  }
+}
+
+static inline int get_type_size(int dtype) {
+  MSHADOW_TYPE_SWITCH(dtype, DType, {return sizeof(DType);});
+  return -1;
+}
+
+void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, TBlob* dns) {
+  CHECK_EQ(ctx.run_ctx.ctx.dev_mask(), cpu::kDevMask);
+  CHECK(src.shape() == dns->shape_);
+  CHECK_EQ(src.dtype(), dns->type_flag_);
+  // This converts the source data to the default format and copy the data to
+  // the destination.
+  const TBlob &src_blob = src.data();
+  memcpy(dns->dptr_, src_blob.dptr_, src.shape().Size() * get_type_size(dns->type_flag_));
+}
+
+void CastStorageDnsMKLImpl(const OpContext& ctx, const NDArray& src, const NDArray &dst) {
+  CHECK_EQ(ctx.run_ctx.ctx.dev_mask(), cpu::kDevMask);
+  CHECK(dst.shape() == src.shape());
+  CHECK_EQ(dst.dtype(), src.dtype());
+
+  std::vector<mkldnn::primitive> net;
+  net.push_back(mkldnn::reorder(*src.GetMKLDNNData(), *dst.GetMKLDNNData()));
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+}
+#endif
+
 DMLC_REGISTER_PARAMETER(CastStorageParam);
 NNVM_REGISTER_OP(cast_storage)
 .add_alias("_sparse_cast_storage")
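Since `CastStorageDnsMKLImpl` above is one of the first places this PR drives an MKL-DNN primitive directly, here is a small usage sketch of the same eager-stream reorder pattern, assuming the MKL-DNN 0.x API the patch itself calls (`reorder` between two preallocated memories, `stream::kind::eager`):

```cpp
#include <vector>
#include <mkldnn.hpp>

// Reorder (layout-convert and copy) between two MKL-DNN memories of the
// same shape, e.g. plain nchw and a blocked layout such as nChw8c.
void ReorderMemory(const mkldnn::memory& from, const mkldnn::memory& to) {
  std::vector<mkldnn::primitive> net;
  net.push_back(mkldnn::reorder(from, to));
  // kind::eager executes the submitted primitive chain immediately.
  mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
}
```

Note that the Dns-to-default direction does not need a reorder at all: `src.data()` already yields the data in the default layout, which is why `CastStorageMKLDnsImpl` can simply `memcpy`.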
diff --git a/src/operator/upsampling.cc b/src/operator/upsampling.cc
deleted file mode 100644
index 653b5709f120..000000000000
--- a/src/operator/upsampling.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file upsampling_nearest.cc
- * \brief
- * \author Bing Xu
-*/
-
-#include "./upsampling-inl.h"
-#include <nnvm/op_attr_types.h>
-#include "./deconvolution-inl.h"
-
-namespace mxnet {
-namespace op {
-template<>
-Operator *CreateOp<cpu>(UpSamplingParam param, int dtype) {
-  Operator *op = NULL;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    if (param.sample_type == up_enum::kNearest) {
-      op = new UpSamplingNearestOp<cpu, DType>(param);
-    } else if (param.sample_type == up_enum::kBilinear) {
-      DeconvolutionParam p = DeconvolutionParam();
-      int kernel = 2 * param.scale - param.scale % 2;
-      int stride = param.scale;
-      int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
-      p.workspace = param.workspace;
-      p.num_group = param.num_filter;
-      p.num_filter = param.num_filter;
-      p.no_bias = true;
-      int shape[] = {1, 1};
-      p.dilate = TShape(shape, shape + 2);
-      shape[0] = shape[1] = kernel;
-      p.kernel = TShape(shape, shape + 2);
-      shape[0] = shape[1] = stride;
-      p.stride = TShape(shape, shape + 2);
-      shape[0] = shape[1] = pad;
-      p.pad = TShape(shape, shape + 2);
-      op = new DeconvolutionOp<cpu, DType>(p);
-    } else {
-      LOG(FATAL) << "Unknown sample type";
-    }
-  });
-  return op;
-}
-
-Operator* UpSamplingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                           std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
-}
-
-DMLC_REGISTER_PARAMETER(UpSamplingParam);
-
-MXNET_REGISTER_OP_PROPERTY(UpSampling, UpSamplingProp)
-.describe("Performs nearest neighbor/bilinear up sampling to inputs.")
-.add_argument("data", "NDArray-or-Symbol[]", "Array of tensors to upsample")
-.add_arguments(UpSamplingParam::__FIELDS__())
-.set_key_var_num_args("num_args");
-
-NNVM_REGISTER_OP(UpSampling)
-.set_attr<nnvm::FSetInputVarAttrOnCompose>("FSetInputVarAttrOnCompose",
-    [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) {
-      if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return;
-      if (index == 1) {
-        var->attrs.dict["__init__"] = "[\"bilinear\", {}]";
-      }
-    });
-}  // namespace op
-}  // namespace mxnet
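The bilinear branch above reduces upsampling to a fixed deconvolution. Plugging the chosen parameters into the standard deconvolution output size `(x-1)*stride - 2*pad + kernel` shows the spatial size comes out to exactly `scale * x`; a standalone check (helper name is mine):

```cpp
#include <cmath>

int BilinearUpsampledSize(int x, int scale) {
  int kernel = 2 * scale - scale % 2;                        // 4 for scale=2, 5 for scale=3
  int stride = scale;
  int pad = static_cast<int>(std::ceil((scale - 1) / 2.0));  // 1 for scale=2 or 3
  return (x - 1) * stride - 2 * pad + kernel;                // == scale * x
}
// scale=2: (x-1)*2 - 2 + 4 = 2*x;  scale=3: (x-1)*3 - 2 + 5 = 3*x
```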
diff --git a/src/operator/upsampling.cu b/src/operator/upsampling.cu
deleted file mode 100644
index 8152535233e4..000000000000
--- a/src/operator/upsampling.cu
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file upsampling_nearest.cc
- * \brief
- * \author Bing Xu
-*/
-
-#include "./deconvolution-inl.h"
-#include "./upsampling-inl.h"
-
-namespace mxnet {
-namespace op {
-template<>
-Operator *CreateOp<gpu>(UpSamplingParam param, int dtype) {
-  Operator *op = NULL;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    if (param.sample_type == up_enum::kNearest) {
-      op = new UpSamplingNearestOp<gpu, DType>(param);
-    } else if (param.sample_type == up_enum::kBilinear) {
-      DeconvolutionParam p = DeconvolutionParam();
-      int kernel = 2 * param.scale - param.scale % 2;
-      int stride = param.scale;
-      int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
-      p.workspace = param.workspace;
-      p.num_group = param.num_filter;
-      p.num_filter = param.num_filter;
-      p.no_bias = true;
-      int shape[] = {1, 1};
-      p.dilate = TShape(shape, shape + 2);
-      shape[0] = shape[1] = kernel;
-      p.kernel = TShape(shape, shape + 2);
-      shape[0] = shape[1] = stride;
-      p.stride = TShape(shape, shape + 2);
-      shape[0] = shape[1] = pad;
-      p.pad = TShape(shape, shape + 2);
-      op = new DeconvolutionOp<gpu, DType>(p);
-    } else {
-      LOG(FATAL) << "Unknown sample type";
-    }
-  });
-  return op;
-}
-
-}  // namespace op
-}  // namespace mxnet
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 024e08983235..0396360d6830 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -925,7 +925,6 @@ def test_nearest_upsampling():
         shapes = [(1,3,base*root_scale*scale**(num_shape-1-i),base*root_scale*scale**(num_shape-1-i)) for i in range(num_shape)]
         check_nearest_upsampling_with_shape(shapes, scale, root_scale)
 
 
-@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/8044")
 def test_batchnorm_training():
     def check_batchnorm_training(stype):
@@ -1011,7 +1010,6 @@ def check_batchnorm_training(stype):
     for stype in stypes:
         check_batchnorm_training(stype)
 
-
 def test_convolution_grouping():
     num_filter = 4
     num_group = 2