diff --git a/Makefile b/Makefile
index a0bd046fbb7a..ea71cd3fff86 100644
--- a/Makefile
+++ b/Makefile
@@ -98,7 +98,7 @@ OBJ = $(patsubst src/%.cc, build/%.o, $(SRC))
 CUSRC = $(wildcard src/*/*.cu)
 CUOBJ = $(patsubst src/%.cu, build/%_gpu.o, $(CUSRC))
 
-ifneq ($(EXTRA_OPERATORS), NONE)
+ifneq ($(EXTRA_OPERATORS),)
 	EXTRA_SRC = $(wildcard $(EXTRA_OPERATORS)/*.cc $(EXTRA_OPERATORS)/*/*.cc)
 	EXTRA_OBJ = $(patsubst $(EXTRA_OPERATORS)/%.cc, $(EXTRA_OPERATORS)/build/%.o, $(EXTRA_SRC))
 	EXTRA_CUSRC = $(wildcard $(EXTRA_OPERATORS)/*.cu $(EXTRA_OPERATORS)/*/*.cu)
@@ -114,9 +114,15 @@ LIB_DEP += $(DMLC_CORE)/libdmlc.a
 ALL_DEP = $(OBJ) $(EXTRA_OBJ) $(LIB_DEP)
 ifeq ($(USE_CUDA), 1)
 	ALL_DEP += $(CUOBJ) $(EXTRA_CUOBJ)
-	LDFLAGS += -lnvrtc -lcuda
+	LDFLAGS += -lcuda
 endif
 
+ifeq ($(USE_NVRTC), 1)
+	LDFLAGS += -lnvrtc
+	CFLAGS += -DMXNET_USE_NVRTC=1
+else
+	CFLAGS += -DMXNET_USE_NVRTC=0
+endif
+
 build/%.o: src/%.cc
@@ -201,3 +207,6 @@ clean_all: clean
 
 -include build/*.d
 -include build/*/*.d
+ifneq ($(EXTRA_OPERATORS),)
+	-include $(EXTRA_OPERATORS)/build/*.d
+endif
diff --git a/include/mxnet/mxrtc.h b/include/mxnet/mxrtc.h
index e0418110277c..de8c385549bb 100644
--- a/include/mxnet/mxrtc.h
+++ b/include/mxnet/mxrtc.h
@@ -7,8 +7,7 @@
 #ifndef MXNET_MXRTC_H_
 #define MXNET_MXRTC_H_
 #include "./base.h"
-#if MXNET_USE_CUDA
-
+#if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
 #include <nvrtc.h>
 #include <cuda.h>
 
@@ -88,5 +87,5 @@ class MXRtc {
 
 }  // namespace mxnet
 
-#endif  // MXNET_USE_CUDA
+#endif  // MXNET_USE_CUDA && MXNET_USE_NVRTC
 #endif  // MXNET_MXRTC_H_
diff --git a/make/config.mk b/make/config.mk
index 6585e5299f5e..8e9f8af3a5da 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -48,6 +48,9 @@ USE_CUDA_PATH = NONE
 # whether use CUDNN R3 library
 USE_CUDNN = 0
 
+# whether use cuda runtime compiling for writing kernels in native language (e.g. Python)
+USE_NVRTC = 0
+
 # whether use opencv during compilation
 # you can disable it, however, you will not able to use
 # imbin iterator
diff --git a/make/osx.mk b/make/osx.mk
index 13a6389bba04..23c2c7a363e5 100644
--- a/make/osx.mk
+++ b/make/osx.mk
@@ -48,6 +48,9 @@ USE_CUDA_PATH = NONE
 # whether use CUDNN R3 library
 USE_CUDNN = 0
 
+# whether use cuda runtime compiling for writing kernels in native language (e.g. Python)
+USE_NVRTC = 0
+
 # whether use opencv during compilation
 # you can disable it, however, you will not able to use
 # imbin iterator
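With this split, runtime compilation becomes opt-in rather than implied by USE_CUDA: -lnvrtc is only linked and MXNET_USE_NVRTC=1 only defined when the new flag is set. A sketch of a build configuration that turns it on (the path and values are illustrative, not part of the patch):

    # in make/config.mk, or overridden on the make command line
    USE_CUDA = 1
    USE_CUDA_PATH = /usr/local/cuda
    USE_NVRTC = 1

Leaving USE_NVRTC at 0 compiles MXNET_USE_NVRTC=0, so the MXRtc code paths guarded below drop out of the build entirely.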
diff --git a/python/mxnet/initializer.py b/python/mxnet/initializer.py
index cddb6a9a07fa..ef371b190808 100644
--- a/python/mxnet/initializer.py
+++ b/python/mxnet/initializer.py
@@ -4,8 +4,10 @@
 import numpy as np
 
 from .base import string_types
-from .ndarray import NDArray
+from .ndarray import NDArray, load
 from . import random
+import logging
+import re
 
 class Initializer(object):
     """Base class for Initializer."""
@@ -75,6 +77,71 @@ def _init_default(self, name, _):
         raise ValueError('Unknown initialization pattern for %s' % name)
     # pylint: enable=no-self-use, missing-docstring, invalid-name
 
+class Load(object):
+    """Initialize parameters by loading pretrained values from a file or dict.
+
+    Parameters
+    ----------
+    param: str or dict of str->NDArray
+        param file or dict mapping name to NDArray.
+    default_init: Initializer
+        default initializer when name is not found in param.
+    verbose: bool
+        whether to log the source of each initialized parameter.
+    """
+    def __init__(self, param, default_init=None, verbose=False):
+        if isinstance(param, str):
+            param = load(param)
+        assert isinstance(param, dict)
+        self.param = {}
+        for name, arr in param.items():
+            if name.startswith('arg:'):
+                self.param[name[4:]] = arr
+            else:
+                self.param[name] = arr
+        self.default_init = default_init
+        self.verbose = verbose
+
+    def __call__(self, name, arr):
+        if name in self.param:
+            assert arr.shape == self.param[name].shape, \
+                'Parameter %s cannot be initialized from loading. '%name + \
+                'Shape mismatch, target %s vs loaded %s'%(str(arr.shape),
+                                                          str(self.param[name].shape))
+            arr[:] = self.param[name]
+            if self.verbose:
+                logging.info('Initialized %s by loading', name)
+        else:
+            assert self.default_init is not None, \
+                "Cannot initialize %s. Not found in loaded param "%name + \
+                "and no default Initializer is provided."
+            self.default_init(name, arr)
+            if self.verbose:
+                logging.info('Initialized %s by default', name)
+
+class Mixed(object):
+    """Initialize parameters with multiple Initializers, dispatched by name pattern.
+
+    Parameters
+    ----------
+    patterns: list of str
+        list of regular expression patterns to match parameter names.
+    initializers: list of Initializer
+        list of Initializers corresponding to patterns.
+    """
+    def __init__(self, patterns, initializers):
+        assert len(patterns) == len(initializers)
+        self.map = list(zip([re.compile(p) for p in patterns], initializers))
+
+    def __call__(self, name, arr):
+        for prog, init in self.map:
+            if prog.match(name):
+                init(name, arr)
+                return
+        raise ValueError('Parameter name %s did not match any pattern. Consider '
+                         'adding a ".*" pattern at the end with a default Initializer.' % name)
+
+
 class Uniform(Initializer):
     """Initialize the weight with uniform [-scale, scale]
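Both new classes are plain callables following the (name, arr) convention used by the other initializers in this file. A minimal usage sketch; the mx.nd alias and the Xavier initializer are assumed to be available here, and all values are purely illustrative:

    import mxnet as mx
    from mxnet.initializer import Load, Mixed, Uniform, Xavier

    # Pretrained values; an 'arg:' prefix (as written by checkpoints) is stripped.
    pretrained = {'arg:fc1_weight': mx.nd.zeros((10, 5))}
    # Reuse what is present, fall back to Uniform for anything new.
    init = Load(pretrained, default_init=Uniform(0.07), verbose=True)

    # Dispatch by name: small uniform noise for biases, Xavier elsewhere.
    # Patterns are tried in order, so the catch-all '.*' goes last.
    init = Mixed(['.*bias', '.*'], [Uniform(0.001), Xavier()])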
""" - def __init__(self, interval, stat_func=None): + def __init__(self, interval, stat_func=None, pattern='.*', sort=False): if stat_func is None: def asum_stat(x): """returns |x|/size(x), async execution.""" @@ -33,9 +39,11 @@ def asum_stat(x): self.queue = [] self.step = 0 self.exes = [] + self.re_prog = re.compile(pattern) + self.sort = sort def stat_helper(name, array): """wrapper for executor callback""" - if not self.activated: + if not self.activated or not self.re_prog.match(name): return array = ctypes.cast(array, NDArrayHandle) array = NDArray(array, writable=False) @@ -73,23 +81,31 @@ def toc(self): Returns ------- res : list of """ - if self.activated: - for exe in self.exes: - for array in exe.arg_arrays: - array.wait_to_read() - for exe in self.exes: - for name, array in zip(exe._symbol.list_arguments(), exe.arg_arrays): - self.queue.append((self.step, name, self.stat_func(array))) - else: + if not self.activated: return [] + for exe in self.exes: + for array in exe.arg_arrays: + array.wait_to_read() + for exe in self.exes: + for name, array in zip(exe._symbol.list_arguments(), exe.arg_arrays): + if self.re_prog.match(name): + self.queue.append((self.step, name, self.stat_func(array))) self.activated = False res = [] - for n, k, v in self.queue: - assert isinstance(v, NDArray) - if v.shape == (1,): - res.append((n, k, str(v.asscalar()))) - else: - res.append((n, k, str(v.asnumpy()))) + if self.sort: + self.queue.sort(key=lambda x: x[1]) + for n, k, v_list in self.queue: + if isinstance(v_list, NDArray): + v_list = [v_list] + assert isinstance(v_list, list) + s = '' + for v in v_list: + assert isinstance(v, NDArray) + if v.shape == (1,): + s += str(v.asscalar()) + '\t' + else: + s += str(v.asnumpy()) + '\t' + res.append((n, k, s)) self.queue = [] return res diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 2c913d85ddf5..3deea52f9e9d 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1154,7 +1154,7 @@ int MXRtcCreate(char* name, mx_uint num_input, mx_uint num_output, NDArrayHandle* inputs, NDArrayHandle* outputs, char* kernel, RtcHandle *out) { API_BEGIN(); -#if MXNET_USE_CUDA +#if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC)) std::vector > input, output; for (mx_uint i = 0; i < num_input; ++i) { input.push_back(std::pair(input_names[i], @@ -1167,8 +1167,8 @@ int MXRtcCreate(char* name, mx_uint num_input, mx_uint num_output, MXRtc *rtc = new MXRtc(name, input, output, kernel); *out = reinterpret_cast(rtc); #else - LOG(FATAL) << "Need to compile with USE_CUDA=1 for MXRtc."; -#endif // MXNET_USE_CUDA + LOG(FATAL) << "Need to compile with USE_CUDA=1 and USE_NVRTC=1 for MXRtc."; +#endif // ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC)) API_END(); } @@ -1181,7 +1181,7 @@ int MXRtcPush(RtcHandle handle, mx_uint num_input, mx_uint num_output, mx_uint blockDimY, mx_uint blockDimZ) { API_BEGIN(); -#if MXNET_USE_CUDA +#if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC)) std::vector input, output; for (mx_uint i = 0; i < num_input; ++i) { input.push_back(*reinterpret_cast(inputs[i])); @@ -1197,18 +1197,18 @@ int MXRtcPush(RtcHandle handle, mx_uint num_input, mx_uint num_output, blockDimY, blockDimZ); #else - LOG(FATAL) << "Need to compile with USE_CUDA=1 for MXRtc."; -#endif // MXNET_USE_CUDA + LOG(FATAL) << "Need to compile with USE_CUDA=1 and USE_NVRTC=1 for MXRtc."; +#endif // ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC)) API_END(); } int MXRtcFree(RtcHandle handle) { API_BEGIN(); -#if MXNET_USE_CUDA +#if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC)) delete reinterpret_cast(handle); #else - 
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 2c913d85ddf5..3deea52f9e9d 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1154,7 +1154,7 @@ int MXRtcCreate(char* name, mx_uint num_input, mx_uint num_output,
                 NDArrayHandle* inputs, NDArrayHandle* outputs,
                 char* kernel, RtcHandle *out) {
   API_BEGIN();
-#if MXNET_USE_CUDA
+#if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
   std::vector<std::pair<std::string, NDArray> > input, output;
   for (mx_uint i = 0; i < num_input; ++i) {
     input.push_back(std::pair<std::string, NDArray>(input_names[i],
@@ -1167,8 +1167,8 @@ int MXRtcCreate(char* name, mx_uint num_input, mx_uint num_output,
   MXRtc *rtc = new MXRtc(name, input, output, kernel);
   *out = reinterpret_cast<RtcHandle>(rtc);
 #else
-  LOG(FATAL) << "Need to compile with USE_CUDA=1 for MXRtc.";
-#endif  // MXNET_USE_CUDA
+  LOG(FATAL) << "Need to compile with USE_CUDA=1 and USE_NVRTC=1 for MXRtc.";
+#endif  // ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
   API_END();
 }
 
@@ -1181,7 +1181,7 @@ int MXRtcPush(RtcHandle handle, mx_uint num_input, mx_uint num_output,
               mx_uint blockDimY,
               mx_uint blockDimZ) {
   API_BEGIN();
-#if MXNET_USE_CUDA
+#if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
   std::vector<NDArray> input, output;
   for (mx_uint i = 0; i < num_input; ++i) {
     input.push_back(*reinterpret_cast<NDArray*>(inputs[i]));
@@ -1197,18 +1197,18 @@ int MXRtcPush(RtcHandle handle, mx_uint num_input, mx_uint num_output,
                                          blockDimY,
                                          blockDimZ);
 #else
-  LOG(FATAL) << "Need to compile with USE_CUDA=1 for MXRtc.";
-#endif  // MXNET_USE_CUDA
+  LOG(FATAL) << "Need to compile with USE_CUDA=1 and USE_NVRTC=1 for MXRtc.";
+#endif  // ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
   API_END();
 }
 
 int MXRtcFree(RtcHandle handle) {
   API_BEGIN();
-#if MXNET_USE_CUDA
+#if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
   delete reinterpret_cast<MXRtc*>(handle);
 #else
-  LOG(FATAL) << "Need to compile with USE_CUDA=1 for MXRtc.";
-#endif  // MXNET_USE_CUDA
+  LOG(FATAL) << "Need to compile with USE_CUDA=1 and USE_NVRTC=1 for MXRtc.";
+#endif  // ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
   API_END();
 }
 
diff --git a/src/common/mxrtc.cc b/src/common/mxrtc.cc
index c23e5eacc94f..4fd687267409 100644
--- a/src/common/mxrtc.cc
+++ b/src/common/mxrtc.cc
@@ -5,10 +5,8 @@
  * \author Junyuan Xie
  */
 #include <mxnet/mxrtc.h>
-#if MXNET_USE_CUDA
-
+#if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
 namespace mxnet {
-
 const std::string MXRtc::str_type = "float";
 std::unordered_map<std::string, char*> MXRtc::kernel_registry;
 
@@ -139,4 +137,4 @@ char* MXRtc::compile(const std::string& name, const std::string& code) {
 
 }  // namespace mxnet
 
-#endif  // MXNET_USE_CUDA
+#endif  // ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC))
diff --git a/src/operator/upsampling-inl.h b/src/operator/upsampling-inl.h
index 0ac021627a04..513258f0cc61 100644
--- a/src/operator/upsampling-inl.h
+++ b/src/operator/upsampling-inl.h
@@ -24,6 +24,7 @@ namespace up_enum {
 enum UpSamplingOpInputs {kData, kWeight};
 enum UpSamplingOpOutputs {kOut};
 enum UpSamplingType {kNearest, kBilinear};
+enum UpSamplingMultiInputMode {kConcat, kSum};
 }  // namespace up_enum
 
 struct UpSamplingParam : public dmlc::Parameter<UpSamplingParam> {
@@ -31,6 +32,7 @@ struct UpSamplingParam : public dmlc::Parameter<UpSamplingParam> {
   index_t num_filter;
   int sample_type;
   int num_args;
+  int multi_input_mode;
   DMLC_DECLARE_PARAMETER(UpSamplingParam) {
     DMLC_DECLARE_FIELD(scale)
     .set_range(1, 1000)
@@ -42,6 +44,13 @@ struct UpSamplingParam : public dmlc::Parameter<UpSamplingParam> {
     .add_enum("nearest", up_enum::kNearest)
     .add_enum("bilinear", up_enum::kBilinear)
     .describe("upsampling method");
+    DMLC_DECLARE_FIELD(multi_input_mode)
+    .add_enum("concat", up_enum::kConcat)
+    .add_enum("sum", up_enum::kSum)
+    .set_default(up_enum::kConcat)
+    .describe("How to handle multiple inputs. concat means concatenate upsampled "
+              "images along the channel dimension. sum means add all images together, "
+              "only available for nearest neighbor upsampling.");
     DMLC_DECLARE_FIELD(num_args).set_lower_bound(1)
     .describe("Number of inputs to be upsampled. For nearest neighbor "
               "upsampling, this can be 1-N; the size of output will be"
@@ -66,6 +75,9 @@ class UpSamplingNearestOp : public Operator {
     using namespace mshadow::expr;
     CHECK_EQ(in_data.size(), param_.num_args);
     CHECK_EQ(out_data.size(), 1);
+    if (req[up_enum::kOut] == kNullOp) {
+      return;
+    }
     Stream<xpu> *s = ctx.get_stream<xpu>();
     Tensor<xpu, 4> out = out_data[up_enum::kOut].get<xpu, 4, real_t>(s);
     if (param_.num_args > 1) {
@@ -74,7 +86,15 @@ class UpSamplingNearestOp : public Operator {
         Tensor<xpu, 4> data = in_data[i].get<xpu, 4, real_t>(s);
         int end = begin + data.size(1);
         int scale = out_data[up_enum::kOut].size(2)/in_data[i].size(2);
-        Assign(slice<1>(out, begin, end), req[up_enum::kOut], upsampling_nearest(data, scale));
+        if (param_.multi_input_mode == up_enum::kSum) {
+          if (i == 0) {
+            Assign(out, req[up_enum::kOut], upsampling_nearest(data, scale));
+          } else {
+            out += upsampling_nearest(data, scale);
+          }
+        } else {
+          Assign(slice<1>(out, begin, end), req[up_enum::kOut], upsampling_nearest(data, scale));
+        }
         begin = end;
       }
     } else {
@@ -103,12 +123,21 @@ class UpSamplingNearestOp : public Operator {
         mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]);
         int end = begin + input_grad.size(1);
         int scale = grad.size(2)/in_shape[0];
-        Assign(input_grad, req[i],
-               pool(slice<1>(grad, begin, end),
-                    in_shape,
-                    scale,
-                    scale,
-                    scale));
+        if (param_.multi_input_mode == up_enum::kSum) {
+          Assign(input_grad, req[i],
+                 pool(grad,
+                      in_shape,
+                      scale,
+                      scale,
+                      scale));
+        } else {
+          Assign(input_grad, req[i],
+                 pool(slice<1>(grad, begin, end),
+                      in_shape,
+                      scale,
+                      scale,
+                      scale));
+        }
         begin = end;
       }
     } else {
@@ -171,7 +200,13 @@ class UpSamplingProp : public OperatorProperty {
           "does not divide output height of " << oh;
         CHECK_EQ(ow%shape[3], 0) << "UpSamplingNearest: input weight of " << shape[3] << \
           "does not divide output weight of " << ow;
-        oshape[1] += shape[1];
+        if (param_.multi_input_mode == up_enum::kSum) {
+          CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \
+            "Number of channels must be the same when multi_input_mode==sum";
+          oshape[1] = shape[1];
+        } else {
+          oshape[1] += shape[1];
+        }
       }
     } else {
       CHECK_EQ(in_shape->size(), 2) << "Input:[data, weight]";
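To illustrate the new option at the symbol level, a rough sketch; the variable names and shapes are illustrative, and the variadic call with num_args is assumed to follow the existing UpSampling convention rather than being introduced by this patch:

    import mxnet as mx

    high = mx.symbol.Variable('high')  # e.g. (N, C, 16, 16)
    low = mx.symbol.Variable('low')    # e.g. (N, C, 8, 8)
    # 'sum' upsamples every input to the output size (scale * first input)
    # and adds them, so all inputs need the same channel count C.
    # 'concat', the default, stacks the upsampled inputs along channels.
    up = mx.symbol.UpSampling(high, low, scale=2, sample_type='nearest',
                              num_args=2, multi_input_mode='sum')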