From ed0954717048d76e2dc63206c26cb7ab138a69d7 Mon Sep 17 00:00:00 2001
From: kshitij12345 <kshitijkalambarkar@gmail.com>
Date: Thu, 19 Dec 2019 00:20:09 +0530
Subject: [PATCH 01/25] [MXNET-978] Higher Order Gradient Support `arcsin`,
 `arccos`. (#15515)

* support arcsin, arccos for higher order grad

* add relevant tests

* add small note for computation

* update comments

* use NodeOpGen

* retrigger CI

* address comment

* rename grad_x -> x_grad

* retrigger CI

* retrigger CI
---
 src/operator/tensor/elemwise_unary_op_trig.cc | 53 ++++++++++++++++++-
 .../python/unittest/test_higher_order_grad.py | 38 +++++++++++++
 2 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/src/operator/tensor/elemwise_unary_op_trig.cc b/src/operator/tensor/elemwise_unary_op_trig.cc
index a436ebb284a3..e5d662a1b262 100644
--- a/src/operator/tensor/elemwise_unary_op_trig.cc
+++ b/src/operator/tensor/elemwise_unary_op_trig.cc
@@ -188,7 +188,31 @@ The storage type of ``arcsin`` output depends upon the input storage type:
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{ "_backward_arcsin" });
 
 MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_arcsin,
-                                                  unary_bwd<mshadow_op::arcsin_grad>);
+                                                  unary_bwd<mshadow_op::arcsin_grad>)
+.set_attr<nnvm::FGradient>("FGradient",
+    [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
+      // ograds[0]: head_grad_grads (dL/dxgrad)
+      // inputs[0]: dL/dy
+      // inputs[1]: x (ElemwiseGradUseIn)
+      // f(x) = arcsin(x)
+      // n: f'(x) = 1/(1-x^2)^1/2
+      // f''(x) = f'(x) * x/(1-x^2)
+      // Note: x/(1-x^2) = x * f'(x)^2
+      auto dydx = n->inputs[0];
+      auto x = n->inputs[1];
+      auto dydx_mul_grad_x = nnvm::NodeEntry{n};
+      auto op = mxnet::util::NodeOpGen{n};
+
+      auto x_grad = op.div(dydx_mul_grad_x, dydx);
+      auto x_grad_square = op.square(x_grad);
+      auto x_grad_square_mul_x = op.mul(x_grad_square, x);
+      auto x_grad_grad = op.mul(dydx_mul_grad_x, x_grad_square_mul_x);
+
+      std::vector<nnvm::NodeEntry> ret;
+      ret.emplace_back(op.mul(ograds[0], x_grad));
+      ret.emplace_back(op.mul(ograds[0], x_grad_grad));
+      return ret;
+    });
 
 // arccos
 MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(arccos, cpu, mshadow_op::arccos)
@@ -207,7 +231,32 @@ The storage type of ``arccos`` output is always dense
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{ "_backward_arccos" });
 
 MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_arccos,
-                                                  unary_bwd<mshadow_op::arccos_grad>);
+                                                  unary_bwd<mshadow_op::arccos_grad>)
+.set_attr<nnvm::FGradient>("FGradient",
+    [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
+      // ograds[0]: head_grad_grads (dL/dxgrad)
+      // inputs[0]: dL/dy
+      // inputs[1]: x (ElemwiseGradUseIn)
+      // f(x) = arccos(x)
+      // n: f'(x) = -1/(1-x^2)^1/2
+      // f''(x) = f'(x) * x/(1-x^2)
+      // Note: x/(1-x^2) = x * f'(x)^2
+      auto dydx = n->inputs[0];
+      auto x = n->inputs[1];
+      auto dydx_mul_grad_x = nnvm::NodeEntry{n};
+      auto op = mxnet::util::NodeOpGen{n};
+
+      auto x_grad = op.div(dydx_mul_grad_x, dydx);
+      auto x_grad_square = op.square(x_grad);
+      auto x_grad_square_mul_x = op.mul(x_grad_square, x);
+      auto x_grad_grad = op.mul(dydx_mul_grad_x, x_grad_square_mul_x);
+
+      std::vector<nnvm::NodeEntry> ret;
+      ret.emplace_back(op.mul(ograds[0], x_grad));
+      ret.emplace_back(op.mul(ograds[0], x_grad_grad));
+      return ret;
+    });
+
 
 // arctan
 MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(arctan, cpu, mshadow_op::arctan)
diff --git a/tests/python/unittest/test_higher_order_grad.py b/tests/python/unittest/test_higher_order_grad.py
index 527c35d5dd94..eeba4226dc36 100644
--- a/tests/python/unittest/test_higher_order_grad.py
+++ b/tests/python/unittest/test_higher_order_grad.py
@@ -133,6 +133,44 @@ def grad_grad_op(x):
             array, tanh, grad_grad_op, rtol=1e-6, atol=1e-6)
 
 
+@with_seed()
+def test_arcsin():
+    def arcsin(x):
+        return nd.arcsin(x)
+
+    def grad_grad_op(x):
+        return x / nd.sqrt((1-x**2)**3)
+
+    for dim in range(1, 5):
+        shape = rand_shape_nd(dim)
+        array = random_arrays(shape)
+        # Hack: Decrease std_dev to make
+        # sure all elements
+        # are in range -1 to 1
+        # i.e. Domain of arcsin
+        array *= 0.2
+        check_second_order_unary(array, arcsin, grad_grad_op)
+
+
+@with_seed()
+def test_arccos():
+    def arccos(x):
+        return nd.arccos(x)
+
+    def grad_grad_op(x):
+        return -x / nd.sqrt((1-x**2)**3)
+
+    for dim in range(1, 5):
+        shape = rand_shape_nd(dim)
+        array = random_arrays(shape)
+        # Hack: Decrease std_dev to make
+        # sure all elements
+        # are in range -1 to 1
+        # i.e. Domain of arccos
+        array *= 0.2
+        check_second_order_unary(array, arccos, grad_grad_op)
+
+
 @with_seed()
 def test_arctan():
     def arctan(x):

From a18250d57ecf34b1499e590b1eea9453d02ab05a Mon Sep 17 00:00:00 2001
From: Zhennan Qin <zhennan.qin@intel.com>
Date: Thu, 19 Dec 2019 07:55:08 +0800
Subject: [PATCH 02/25] Add silent option to quantization script (#17094)

* Add silent option to quantization script

* Refactor code

* Fix lint
---
 .../quantization/imagenet_gen_qsym_mkldnn.py  | 70 +++++++++------
 python/mxnet/contrib/quantization.py          | 90 +++++++++++--------
 .../quantization/quantize_graph_pass.cc       | 26 ++++--
 3 files changed, 117 insertions(+), 69 deletions(-)

diff --git a/example/quantization/imagenet_gen_qsym_mkldnn.py b/example/quantization/imagenet_gen_qsym_mkldnn.py
index a4c1491039b9..130282714e30 100644
--- a/example/quantization/imagenet_gen_qsym_mkldnn.py
+++ b/example/quantization/imagenet_gen_qsym_mkldnn.py
@@ -140,17 +140,23 @@ def save_params(fname, arg_params, aux_params, logger=None):
                         help='If enabled, the quantize op will '
                              'be calibrated offline if calibration mode is '
                              'enabled')
+    parser.add_argument('--quiet', action='store_true', default=False,
+                        help='suppress most of log')
     args = parser.parse_args()
     ctx = mx.cpu(0)
-    logging.basicConfig()
-    logger = logging.getLogger('logger')
-    logger.setLevel(logging.INFO)
+    logger = None
+    if not args.quiet:
+        logging.basicConfig()
+        logger = logging.getLogger('logger')
+        logger.setLevel(logging.INFO)
 
-    logger.info(args)
-    logger.info('shuffle_dataset=%s' % args.shuffle_dataset)
+    if logger:
+        logger.info(args)
+        logger.info('shuffle_dataset=%s' % args.shuffle_dataset)
 
     calib_mode = args.calib_mode
-    logger.info('calibration mode set to %s' % calib_mode)
+    if logger:
+        logger.info('calibration mode set to %s' % calib_mode)
 
     # download calibration dataset
     if calib_mode != 'none':
@@ -158,13 +164,16 @@ def save_params(fname, arg_params, aux_params, logger=None):
 
     # download model
     if not args.no_pretrained:
-        logger.info('Get pre-trained model from MXNet or Gluoncv modelzoo.')
-        logger.info('If you want to use custom model, please set --no-pretrained.')
+        if logger:
+            logger.info('Get pre-trained model from MXNet or Gluoncv modelzoo.')
+            logger.info('If you want to use custom model, please set --no-pretrained.')
         if args.model in ['imagenet1k-resnet-152', 'imagenet1k-inception-bn']:
-            logger.info('model %s is downloaded from MXNet modelzoo' % args.model)
+            if logger:
+                logger.info('model %s is downloaded from MXNet modelzoo' % args.model)
             prefix, epoch = download_model(model_name=args.model, logger=logger)
         else:
-            logger.info('model %s is converted from GluonCV' % args.model)
+            if logger:
+                logger.info('model %s is converted from GluonCV' % args.model)
             prefix = convert_from_gluon(model_name=args.model, image_shape=args.image_shape, classes=1000, logger=logger)
             rgb_mean = '123.68,116.779,103.939'
             rgb_std = '58.393, 57.12, 57.375'
@@ -178,14 +187,16 @@ def save_params(fname, arg_params, aux_params, logger=None):
 
     # get batch size
     batch_size = args.batch_size
-    logger.info('batch size = %d for calibration' % batch_size)
+    if logger:
+        logger.info('batch size = %d for calibration' % batch_size)
 
     # get number of batches for calibration
     num_calib_batches = args.num_calib_batches
-    if calib_mode == 'none':
-        logger.info('skip calibration step as calib_mode is none')
-    else:
-        logger.info('number of batches = %d for calibration' % num_calib_batches)
+    if logger:
+        if calib_mode == 'none':
+            logger.info('skip calibration step as calib_mode is none')
+        else:
+            logger.info('number of batches = %d for calibration' % num_calib_batches)
 
     # get number of threads for decoding the dataset
     data_nthreads = args.data_nthreads
@@ -195,7 +206,8 @@ def save_params(fname, arg_params, aux_params, logger=None):
 
     exclude_first_conv = args.exclude_first_conv
     if args.quantized_dtype == "uint8":
-        logger.info('quantized dtype is set to uint8, will exclude first conv.')
+        if logger:
+            logger.info('quantized dtype is set to uint8, will exclude first conv.')
         exclude_first_conv = True
     excluded_sym_names = []
     if not args.no_pretrained:
@@ -242,42 +254,48 @@ def save_params(fname, arg_params, aux_params, logger=None):
         else:
             raise ValueError('Currently, model %s is not supported in this script' % args.model)
     else:
-        logger.info('Please set proper RGB configs for model %s' % args.model)
+        if logger:
+            logger.info('Please set proper RGB configs for model %s' % args.model)
         # add rgb mean/std of your model.
         rgb_mean = '0,0,0'
         rgb_std = '0,0,0'
         # add layer names you donnot want to quantize.
-        logger.info('Please set proper excluded_sym_names for model %s' % args.model)
+        if logger:
+            logger.info('Please set proper excluded_sym_names for model %s' % args.model)
         excluded_sym_names += ['layers']
         if exclude_first_conv:
             excluded_sym_names += ['layers']
 
-    logger.info('These layers have been excluded %s' % excluded_sym_names)
+    if logger:
+        logger.info('These layers have been excluded %s' % excluded_sym_names)
 
     label_name = args.label_name
-    logger.info('label_name = %s' % label_name)
+    if logger:
+        logger.info('label_name = %s' % label_name)
 
     data_shape = tuple([int(i) for i in image_shape.split(',')])
-    logger.info('Input data shape = %s' % str(data_shape))
-
-    logger.info('rgb_mean = %s' % rgb_mean)
+    if logger:
+        logger.info('Input data shape = %s' % str(data_shape))
+        logger.info('rgb_mean = %s' % rgb_mean)
+        logger.info('rgb_std = %s' % rgb_std)
     rgb_mean = [float(i) for i in rgb_mean.split(',')]
     mean_args = {'mean_r': rgb_mean[0], 'mean_g': rgb_mean[1], 'mean_b': rgb_mean[2]}
-    logger.info('rgb_std = %s' % rgb_std)
     rgb_std = [float(i) for i in rgb_std.split(',')]
     std_args = {'std_r': rgb_std[0], 'std_g': rgb_std[1], 'std_b': rgb_std[2]}
     combine_mean_std = {}
     combine_mean_std.update(mean_args)
     combine_mean_std.update(std_args)
     if calib_mode == 'none':
-        logger.info('Quantizing FP32 model %s' % args.model)
+        if logger:
+            logger.info('Quantizing FP32 model %s' % args.model)
         qsym, qarg_params, aux_params = quantize_model_mkldnn(sym=sym, arg_params=arg_params, aux_params=aux_params,
                                                               ctx=ctx, excluded_sym_names=excluded_sym_names,
                                                               calib_mode=calib_mode, quantized_dtype=args.quantized_dtype,
                                                               logger=logger)
         sym_name = '%s-symbol.json' % (prefix + '-quantized')
     else:
-        logger.info('Creating ImageRecordIter for reading calibration dataset')
+        if logger:
+            logger.info('Creating ImageRecordIter for reading calibration dataset')
         data = mx.io.ImageRecordIter(path_imgrec=args.calib_dataset,
                                      label_width=1,
                                      preprocess_threads=data_nthreads,
diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py
index b0714037bb42..01051ab7c8e4 100644
--- a/python/mxnet/contrib/quantization.py
+++ b/python/mxnet/contrib/quantization.py
@@ -188,8 +188,8 @@ def collect(self, name, arr):
             return
         handle = ctypes.cast(arr, NDArrayHandle)
         arr = NDArray(handle, writable=False).copyto(cpu()).asnumpy()
-        if self.logger is not None:
-            self.logger.info("Collecting layer %s histogram of shape %s" % (name, arr.shape))
+        if self.logger:
+            self.logger.debug("Collecting layer %s histogram of shape %s" % (name, arr.shape))
         min_range = np.min(arr)
         max_range = np.max(arr)
         th = max(abs(min_range), abs(max_range))
@@ -224,9 +224,9 @@ def collect(self, name, arr):
                                        max(cur_min_max[1], max_range))
         else:
             self.min_max_dict[name] = (min_range, max_range)
-        if self.logger is not None:
-            self.logger.info("Collecting layer %s min_range=%f, max_range=%f"
-                             % (name, min_range, max_range))
+        if self.logger:
+            self.logger.debug("Collecting layer %s min_range=%f, max_range=%f"
+                              % (name, min_range, max_range))
 
 def _calibrate_quantized_sym(qsym, th_dict):
     """Given a dictionary containing the thresholds for quantizing the layers,
@@ -358,18 +358,19 @@ def _get_optimal_thresholds(hist_dict, quantized_dtype, num_quantized_bins=255,
         else:
             th_dict[name] = (-th, th)
         del hist_dict[name]  # release the memory
-        if logger is not None:
-            logger.info('layer=%s, min_val=%f, max_val=%f, th=%f, divergence=%f'
-                        % (name, min_val, max_val, th, divergence))
+        if logger:
+            logger.debug('layer=%s, min_val=%f, max_val=%f, th=%f, divergence=%f'
+                         % (name, min_val, max_val, th, divergence))
     return th_dict
 
 
-def _load_sym(sym, logger=logging):
+def _load_sym(sym, logger=None):
     """Given a str as a path the symbol .json file or a symbol, returns a Symbol object."""
     if isinstance(sym, str):  # sym is a symbol file path
         cur_path = os.path.dirname(os.path.realpath(__file__))
         symbol_file_path = os.path.join(cur_path, sym)
-        logger.info('Loading symbol from file %s' % symbol_file_path)
+        if logger:
+            logger.info('Loading symbol from file %s' % symbol_file_path)
         return sym_load(symbol_file_path)
     elif isinstance(sym, Symbol):
         return sym
@@ -378,14 +379,15 @@ def _load_sym(sym, logger=logging):
                          ' while received type %s' % str(type(sym)))
 
 
-def _load_params(params, logger=logging):
+def _load_params(params, logger=None):
     """Given a str as a path to the .params file or a pair of params,
     returns two dictionaries representing arg_params and aux_params.
     """
     if isinstance(params, str):
         cur_path = os.path.dirname(os.path.realpath(__file__))
         param_file_path = os.path.join(cur_path, params)
-        logger.info('Loading params from file %s' % param_file_path)
+        if logger:
+            logger.info('Loading params from file %s' % param_file_path)
         save_dict = nd_load(param_file_path)
         arg_params = {}
         aux_params = {}
@@ -451,7 +453,7 @@ def quantize_model(sym, arg_params, aux_params,
                    data_names=('data',), label_names=('softmax_label',),
                    ctx=cpu(), excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy',
                    calib_data=None, num_calib_examples=None,
-                   quantized_dtype='int8', quantize_mode='smart', logger=logging):
+                   quantized_dtype='int8', quantize_mode='smart', logger=None):
     """User-level API for generating a quantized model from a FP32 model w/ or w/o calibration.
     The backend quantized operators are only enabled for Linux systems. Please do not run
     inference using the quantized models on Windows for now.
@@ -530,7 +532,9 @@ def quantize_model(sym, arg_params, aux_params,
                          ' the names of the operators that will not be quantized,'
                          ' while received type %s' % str(type(excluded_op_names)))
 
-    logger.info('Quantizing symbol')
+    if logger:
+        os.environ['MXNET_QUANTIZATION_VERBOSE'] = '1'
+        logger.info('Quantizing symbol')
     if quantized_dtype not in ('int8', 'uint8', 'auto'):
         raise ValueError('unknown quantized_dtype %s received,'
                          ' expected `int8`, `uint8` or `auto`' % quantized_dtype)
@@ -561,21 +565,24 @@ def quantize_model(sym, arg_params, aux_params,
                                                                include_layer=calib_layer,
                                                                max_num_examples=num_calib_examples,
                                                                logger=logger)
-            logger.info('Collected layer outputs from FP32 model using %d examples' % num_examples)
-            logger.info('Calculating optimal thresholds for quantization')
+            if logger:
+                logger.info('Collected layer outputs from FP32 model using %d examples' % num_examples)
+                logger.info('Calculating optimal thresholds for quantization')
             th_dict = _get_optimal_thresholds(hist_dict, quantized_dtype, logger=logger)
         elif calib_mode == 'naive':
             th_dict, num_examples = _collect_layer_output_min_max(
                 mod, calib_data, quantized_dtype, include_layer=calib_layer, max_num_examples=num_calib_examples,
                 logger=logger)
-            logger.info('Collected layer output min/max values from FP32 model using %d examples'
-                        % num_examples)
+            if logger:
+                logger.info('Collected layer output min/max values from FP32 model using %d examples'
+                            % num_examples)
         else:
             raise ValueError('unknown calibration mode %s received,'
                              ' expected `none`, `naive`, or `entropy`' % calib_mode)
         qsym = _calibrate_quantized_sym(qsym, th_dict)
 
-    logger.info('Quantizing parameters')
+    if logger:
+        logger.info('Quantizing parameters')
     qarg_params = _quantize_params(qsym, arg_params, th_dict)
 
     return qsym, qarg_params, aux_params
@@ -584,7 +591,7 @@ def quantize_model_mkldnn(sym, arg_params, aux_params,
                           data_names=('data',), label_names=('softmax_label',),
                           ctx=cpu(), excluded_sym_names=None, excluded_op_names=None,
                           calib_mode='entropy', calib_data=None, num_calib_examples=None,
-                          quantized_dtype='int8', quantize_mode='smart', logger=logging):
+                          quantized_dtype='int8', quantize_mode='smart', logger=None):
     """User-level API for generating a fusion + quantized model from a FP32 model
     w/ or w/o calibration with Intel MKL-DNN.
     The backend quantized operators are only enabled for Linux systems. Please do not run
@@ -621,7 +628,7 @@ def quantize_model_mkldnn(sym, arg_params, aux_params,
 
 def quantize_graph(sym, arg_params, aux_params, ctx=cpu(),
                    excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy',
-                   quantized_dtype='int8', quantize_mode='full', logger=logging):
+                   quantized_dtype='int8', quantize_mode='full', logger=None):
     """User-level API for generating a quantized model from a FP32 model w/o calibration
     and a collector for naive or entropy calibration.
     The backend quantized operators are only enabled for Linux systems. Please do not run
@@ -676,7 +683,9 @@ def quantize_graph(sym, arg_params, aux_params, ctx=cpu(),
                          ' while received type %s' % str(type(excluded_sym_names)))
     if not isinstance(ctx, Context):
         raise ValueError('currently only supports single ctx, while received %s' % str(ctx))
-    logger.info('Quantizing graph')
+    if logger:
+        os.environ['MXNET_QUANTIZATION_VERBOSE'] = '1'
+        logger.info('Quantizing graph')
     if quantized_dtype not in ('int8', 'uint8', 'auto'):
         raise ValueError('unknown quantized_dtype %s received,'
                          ' expected `int8`, `uint8` or `auto`' % quantized_dtype)
@@ -693,20 +702,24 @@ def quantize_graph(sym, arg_params, aux_params, ctx=cpu(),
         if calib_mode == 'entropy':
             collector = _LayerHistogramCollector(
                 include_layer=calib_layer, logger=logger)
-            logger.info(
-                'Create a layer output collector for entropy calibration.')
+            if logger:
+                logger.info(
+                    'Create a layer output collector for entropy calibration.')
         elif calib_mode == 'naive':
             collector = _LayerOutputMinMaxCollector(quantized_dtype=quantized_dtype,
                                                     include_layer=calib_layer, logger=logger)
-            logger.info(
-                'Create a layer output minmax collector for naive calibration')
+            if logger:
+                logger.info(
+                    'Create a layer output minmax collector for naive calibration')
         else:
             raise ValueError('unknown calibration mode %s received,'
                              ' expected `none`, `naive`, or `entropy`' % calib_mode)
-        logger.info('Collector created, please use set_monitor_callback'
-                    ' to collect calibration information.')
+        if logger:
+            logger.info('Collector created, please use set_monitor_callback'
+                        ' to collect calibration information.')
 
-    logger.info('Quantizing parameters')
+    if logger:
+        logger.info('Quantizing parameters')
     qarg_params = _quantize_params(qsym, arg_params, th_dict)
 
     return qsym, qarg_params, aux_params, collector
@@ -751,7 +764,8 @@ def calib_graph(qsym, arg_params, aux_params, collector,
     th_dict = {}
     if calib_mode is not None and calib_mode != 'none':
         if calib_mode == 'entropy':
-            logger.info('Calculating optimal thresholds for quantization')
+            if logger:
+                logger.info('Calculating optimal thresholds for quantization')
             th_dict = _get_optimal_thresholds(
                 collector.hist_dict, quantized_dtype, logger=logger)
         elif calib_mode == 'naive':
@@ -763,7 +777,8 @@ def calib_graph(qsym, arg_params, aux_params, collector,
     else:
         raise ValueError('please set calibration mode to naive or entropy.')
 
-    logger.info('Quantizing parameters')
+    if logger:
+        logger.info('Quantizing parameters')
     qarg_params = _quantize_params(qsym, arg_params, th_dict)
 
     return qsym, qarg_params, aux_params
@@ -771,7 +786,7 @@ def calib_graph(qsym, arg_params, aux_params, collector,
 def quantize_net(network, quantized_dtype='auto', quantize_mode='full',
                  exclude_layers=None, exclude_layers_match=None, exclude_operators=None,
                  calib_data=None, data_shapes=None, calib_mode='none',
-                 num_calib_examples=None, ctx=cpu(), logger=logging):
+                 num_calib_examples=None, ctx=cpu(), logger=None):
     """User-level API for Gluon users to generate a quantized SymbolBlock from a FP32 HybridBlock w/ or w/o calibration.
     The backend quantized operators are only enabled for Linux systems. Please do not run
     inference using the quantized models on Windows for now.
@@ -825,7 +840,8 @@ def quantize_net(network, quantized_dtype='auto', quantize_mode='full',
     -------
     """
 
-    logger.info('Export HybridBlock')
+    if logger:
+        logger.info('Export HybridBlock')
     network.hybridize()
     import mxnet as mx
     if calib_data is not None:
@@ -881,7 +897,8 @@ def __exit__(self, exc_type, exc_value, traceback):
         for layers in list(symnet.get_internals()):
             if layers.name.find(name_match) != -1:
                 exclude_layers.append(layers.name)
-    logger.info('These layers have been excluded %s' % exclude_layers)
+    if logger:
+        logger.info('These layers have been excluded %s' % exclude_layers)
 
     if ctx == mx.cpu():
         symnet = symnet.get_backend_symbol('MKLDNN_QUANTIZE')
@@ -906,8 +923,9 @@ def __exit__(self, exc_type, exc_value, traceback):
             mod.set_params(args, auxs, allow_missing=False, force_init=True)
             num_examples = _collect_layer_statistics(mod, calib_data, collector,
                                                      num_calib_examples, logger)
-            logger.info('Collected layer output values from FP32 model using %d examples'
-                        % num_examples)
+            if logger:
+                logger.info('Collected layer output values from FP32 model using %d examples'
+                            % num_examples)
             qsym, qarg_params, aux_params = calib_graph(
                 qsym=qsym, arg_params=args, aux_params=auxs, collector=collector,
                 calib_mode=calib_mode, quantized_dtype=quantized_dtype, logger=logger)
diff --git a/src/operator/quantization/quantize_graph_pass.cc b/src/operator/quantization/quantize_graph_pass.cc
index 182f6339308a..01365067ce93 100644
--- a/src/operator/quantization/quantize_graph_pass.cc
+++ b/src/operator/quantization/quantize_graph_pass.cc
@@ -275,12 +275,15 @@ Graph QuantizeGraph(Graph &&src) {
   std::unordered_map<Node*, NodePtr> mirror_map;
   std::unordered_map<NodePtr, NodePtr> reverse_mirror_map;
   nnvm::NodeEntryMap<NodeEntry> mirror_entry_map;
+  static int verbose = dmlc::GetEnv("MXNET_QUANTIZATION_VERBOSE", 0);
   DFSVisit(src.outputs, [&](const NodePtr& node) {
     NodePtr new_node = Node::Create();
     // If the currently visited node needs quantization, insert a quantize op node before the
     // current node and replace the current node with the quantized version in the new graph.
     if (quantized_node_map.count(node)) {
-      std::cout << node->attrs.name << " is quantized." << std::endl;
+      if (verbose) {
+        LOG(INFO) << node->attrs.name << " is quantized.";
+      }
       new_node = quantized_node_map[node];
 
       // add data into quantized op input
@@ -395,7 +398,8 @@ Graph QuantizeGraph(Graph &&src) {
       // (e.g., a quantized_conv2d node), and insert a dequantize op node in the new graph if there
       // are any. Otherwise, simply add a copy of the current node's entry to the inputs of
       // the new_node.
-      if (!node->is_variable()) std::cout << node->attrs.name << " is NOT quantized." << std::endl;
+      if (verbose && !node->is_variable())
+        LOG(INFO) << node->attrs.name << " is NOT quantized.";
       *new_node = *node;
       new_node->inputs.clear();
       for (const auto& e : node->inputs) {
@@ -516,15 +520,20 @@ static inline void SetCalibTableForEntry(
     out_data_name = out_data_name.substr(prefix.size());
   }
   const auto calib_table_iter = calib_table.find(out_data_name);
+  static int verbose = dmlc::GetEnv("MXNET_QUANTIZATION_VERBOSE", 0);
   if (calib_table_iter != calib_table.end()) {
-    std::cout << "Set calibration result to " << node->attrs.name
-              << " : min=" << calib_table_iter->second.first
-              << " max=" << calib_table_iter->second.second << std::endl;
+    if (verbose) {
+      LOG(INFO) << "Set calibration result to " << node->attrs.name
+                << " : min=" << calib_table_iter->second.first
+                << " max=" << calib_table_iter->second.second;
+    }
     node->attrs.dict["min_calib_range"] = std::to_string(calib_table_iter->second.first);
     node->attrs.dict["max_calib_range"] = std::to_string(calib_table_iter->second.second);
     if (node->op() && node->op()->attr_parser) node->op()->attr_parser(&(node->attrs));
   } else {
-    std::cout << "Can't find calibration result for " << node->attrs.name << std::endl;
+    if (verbose) {
+      LOG(INFO) << "Can't find calibration result for " << node->attrs.name;
+    }
   }
 }
 
@@ -535,7 +544,10 @@ Graph SetCalibTableToQuantizedGraph(Graph&& g) {
       Op::GetAttr<mxnet::FNeedCalibrateInput>("FNeedCalibrateInput");
   static const auto& need_calib_output_map =
       Op::GetAttr<mxnet::FNeedCalibrateOutput>("FNeedCalibrateOutput");
-  std::cout << "Set calibration result to quantized symbol." << std::endl;
+  static int verbose = dmlc::GetEnv("MXNET_QUANTIZATION_VERBOSE", 0);
+  if (verbose) {
+    LOG(INFO) << "Set calibration result to quantized symbol.";
+  }
   DFSVisit(g.outputs, [&](const NodePtr& node) {
     if (need_calib_input_map.count(node->op())) {
       const auto calib_idx = need_calib_input_map[node->op()](node->attrs);

From a7f33eb1e1a0e1b1959c5184363844c2b346536f Mon Sep 17 00:00:00 2001
From: Minghao Liu <40382964+Tommliu@users.noreply.github.com>
Date: Thu, 19 Dec 2019 14:15:37 +0800
Subject: [PATCH 03/25] numpy bincount (#16965)

---
 python/mxnet/ndarray/numpy/_op.py             |  59 ++++++-
 python/mxnet/numpy/multiarray.py              |  53 +++++-
 python/mxnet/numpy_dispatch_protocol.py       |   1 +
 python/mxnet/symbol/numpy/_symbol.py          |  36 +++-
 src/operator/numpy/np_bincount_op-inl.h       | 147 ++++++++++++++++
 src/operator/numpy/np_bincount_op.cc          | 133 +++++++++++++++
 src/operator/numpy/np_bincount_op.cu          | 160 ++++++++++++++++++
 .../unittest/test_numpy_interoperability.py   |  18 ++
 tests/python/unittest/test_numpy_op.py        |  50 ++++++
 9 files changed, 654 insertions(+), 3 deletions(-)
 create mode 100644 src/operator/numpy/np_bincount_op-inl.h
 create mode 100644 src/operator/numpy/np_bincount_op.cc
 create mode 100644 src/operator/numpy/np_bincount_op.cu

diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py
index c111a95a707a..8d56c1f651a3 100644
--- a/python/mxnet/ndarray/numpy/_op.py
+++ b/python/mxnet/ndarray/numpy/_op.py
@@ -40,7 +40,7 @@
            'blackman', 'flip', 'around', 'hypot', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', 'unique', 'lcm',
            'tril', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal', 'greater', 'less',
            'greater_equal', 'less_equal', 'hsplit', 'rot90', 'einsum', 'true_divide', 'nonzero', 'shares_memory',
-           'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where']
+           'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where', 'bincount']
 
 
 @set_module('mxnet.ndarray.numpy')
@@ -5929,3 +5929,60 @@ def where(condition, x=None, y=None):
         return nonzero(condition)
     else:
         return _npi.where(condition, x, y, out=None)
+
+
+@set_module('mxnet.ndarray.numpy')
+def bincount(x, weights=None, minlength=0):
+    """
+    Count number of occurrences of each value in array of non-negative ints.
+
+    Parameters
+    ----------
+    x : ndarray
+        input array, 1 dimension, nonnegative ints.
+    weights: ndarray
+        input weigths same shape as x. (Optional)
+    minlength: int
+        A minimum number of bins for the output. (Optional)
+
+    Returns
+    --------
+    out : ndarray
+        the result of binning the input array. The length of out is equal to amax(x)+1.
+
+    Raises
+    --------
+    Value Error
+        If the input is not 1-dimensional, or contains elements with negative values,
+        or if minlength is negative
+    TypeError
+        If the type of the input is float or complex.
+
+    Examples
+    --------
+    >>> np.bincount(np.arange(5))
+    array([1, 1, 1, 1, 1])
+    >>> np.bincount(np.array([0, 1, 1, 3, 2, 1, 7]))
+    array([1, 3, 1, 1, 0, 0, 0, 1])
+
+    >>> x = np.array([0, 1, 1, 3, 2, 1, 7, 23])
+    >>> np.bincount(x).size == np.amax(x)+1
+    True
+
+    >>> np.bincount(np.arange(5, dtype=float))
+    Traceback (most recent call last):
+    File "<stdin>", line 1, in <module>
+    TypeError: array cannot be safely cast to required type
+
+    >>> w = np.array([0.3, 0.5, 0.2, 0.7, 1., -0.6]) # weights
+    >>> x = np.array([0, 1, 1, 2, 2, 2])
+    >>> np.bincount(x,  weights=w)
+    array([ 0.3,  0.7,  1.1])
+    """
+    if not isinstance(x, NDArray):
+        raise TypeError("Input data should be NDarray")
+    if minlength < 0:
+        raise ValueError("Minlength value should greater than 0")
+    if weights is None:
+        return _npi.bincount(x, minlength=minlength, has_weights=False)
+    return _npi.bincount(x, weights=weights, minlength=minlength, has_weights=True)
diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py
index 5795c62942df..c3c6f4db8ba0 100644
--- a/python/mxnet/numpy/multiarray.py
+++ b/python/mxnet/numpy/multiarray.py
@@ -59,7 +59,7 @@
            'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', 'unique', 'lcm', 'tril', 'identity', 'take',
            'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal', 'greater', 'less', 'greater_equal',
            'less_equal', 'hsplit', 'rot90', 'einsum', 'true_divide', 'nonzero', 'shares_memory',
-           'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where']
+           'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where', 'bincount']
 
 # Return code for dispatching indexing function call
 _NDARRAY_UNSUPPORTED_INDEXING = -1
@@ -7892,3 +7892,54 @@ def where(condition, x=None, y=None):
            [ 0.,  3., -1.]])
     """
     return _mx_nd_np.where(condition, x, y)
+
+
+@set_module('mxnet.numpy')
+def bincount(x, weights=None, minlength=0):
+    """
+    Count number of occurrences of each value in array of non-negative ints.
+
+    Parameters
+    ----------
+    x : ndarray
+        input array, 1 dimension, nonnegative ints.
+    weights: ndarray
+        input weigths same shape as x. (Optional)
+    minlength: int
+        A minimum number of bins for the output. (Optional)
+
+    Returns
+    --------
+    out : ndarray
+        the result of binning the input array. The length of out is equal to amax(x)+1.
+
+    Raises
+    --------
+    Value Error
+        If the input is not 1-dimensional, or contains elements with negative values,
+        or if minlength is negative
+    TypeError
+        If the type of the input is float or complex.
+
+    Examples
+    --------
+    >>> np.bincount(np.arange(5))
+    array([1, 1, 1, 1, 1])
+    >>> np.bincount(np.array([0, 1, 1, 3, 2, 1, 7]))
+    array([1, 3, 1, 1, 0, 0, 0, 1])
+
+    >>> x = np.array([0, 1, 1, 3, 2, 1, 7, 23])
+    >>> np.bincount(x).size == np.amax(x)+1
+    True
+
+    >>> np.bincount(np.arange(5, dtype=float))
+    Traceback (most recent call last):
+    File "<stdin>", line 1, in <module>
+    TypeError: array cannot be safely cast to required type
+
+    >>> w = np.array([0.3, 0.5, 0.2, 0.7, 1., -0.6]) # weights
+    >>> x = np.array([0, 1, 1, 2, 2, 2])
+    >>> np.bincount(x,  weights=w)
+    array([ 0.3,  0.7,  1.1])
+    """
+    return _mx_nd_np.bincount(x, weights=weights, minlength=minlength)
diff --git a/python/mxnet/numpy_dispatch_protocol.py b/python/mxnet/numpy_dispatch_protocol.py
index e93720564774..bd5c388a5100 100644
--- a/python/mxnet/numpy_dispatch_protocol.py
+++ b/python/mxnet/numpy_dispatch_protocol.py
@@ -147,6 +147,7 @@ def _run_with_array_ufunc_proto(*args, **kwargs):
     'resize',
     'where',
     'full_like',
+    'bincount'
 ]
 
 
diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py
index c61d5b2d393d..0fb0d538082d 100644
--- a/python/mxnet/symbol/numpy/_symbol.py
+++ b/python/mxnet/symbol/numpy/_symbol.py
@@ -48,7 +48,7 @@
            'blackman', 'flip', 'around', 'hypot', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', 'unique', 'lcm',
            'tril', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal', 'greater', 'less',
            'greater_equal', 'less_equal', 'hsplit', 'rot90', 'einsum', 'true_divide', 'shares_memory',
-           'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where']
+           'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where', 'bincount']
 
 
 @set_module('mxnet.symbol.numpy')
@@ -5468,4 +5468,38 @@ def load_json(json_str):
     return _Symbol(handle)
 
 
+@set_module('mxnet.symbol.numpy')
+def bincount(x, weights=None, minlength=0):
+    """
+    Count number of occurrences of each value in array of non-negative ints.
+
+    Parameters
+    ----------
+    x : _Symbol
+        input data
+    weights: _Symbol
+        input weigths same shape as x. (Optional)
+    minlength: int
+        A minimum number of bins for the output. (Optional)
+
+    Returns
+    --------
+    out : _Symbol
+        the result of binning the input data. The length of out is equal to amax(x)+1.
+
+    Raises:
+    --------
+    Value Error
+        If the input is not 1-dimensional, or contains elements with negative values,
+        or if minlength is negative
+    TypeError
+        If the type of the input is float or complex.
+    """
+    if minlength < 0:
+        raise ValueError("Minlength value should greater than 0")
+    if weights is None:
+        return _npi.bincount(x, minlength=minlength, has_weights=False)
+    return _npi.bincount(x, weights=weights, minlength=minlength, has_weights=True)
+
+
 _set_np_symbol_class(_Symbol)
diff --git a/src/operator/numpy/np_bincount_op-inl.h b/src/operator/numpy/np_bincount_op-inl.h
new file mode 100644
index 000000000000..254ea8fdec22
--- /dev/null
+++ b/src/operator/numpy/np_bincount_op-inl.h
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file np_bicount_op-inl.h
+ * \brief numpy compatible bincount operator
+ */
+#ifndef MXNET_OPERATOR_NUMPY_NP_BINCOUNT_OP_INL_H_
+#define MXNET_OPERATOR_NUMPY_NP_BINCOUNT_OP_INL_H_
+
+#include <mxnet/operator_util.h>
+#include <utility>
+#include <vector>
+#include "../mshadow_op.h"
+#include "../mxnet_op.h"
+#include "../operator_common.h"
+#include "../elemwise_op_common.h"
+#include "np_broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+struct NumpyBincountParam : public dmlc::Parameter<NumpyBincountParam> {
+  int minlength;
+  bool has_weights;
+  DMLC_DECLARE_PARAMETER(NumpyBincountParam) {
+    DMLC_DECLARE_FIELD(minlength)
+    .set_default(0)
+    .describe("A minimum number of bins for the output array"
+              "If minlength is specified, there will be at least this"
+              "number of bins in the output array");
+    DMLC_DECLARE_FIELD(has_weights)
+    .set_default(false)
+    .describe("Determine whether Bincount has weights.");
+  }
+};
+
+inline bool NumpyBincountType(const nnvm::NodeAttrs& attrs,
+                              std::vector<int> *in_attrs,
+                              std::vector<int> *out_attrs) {
+  const NumpyBincountParam& param = nnvm::get<NumpyBincountParam>(attrs.parsed);
+  if (!param.has_weights) {
+    return ElemwiseType<1, 1>(attrs, in_attrs, out_attrs) && in_attrs->at(0) != -1;
+  } else {
+    CHECK_EQ(out_attrs->size(), 1U);
+    CHECK_EQ(in_attrs->size(), 2U);
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(1));
+    TYPE_ASSIGN_CHECK(*in_attrs, 1, out_attrs->at(0));
+    return out_attrs->at(0) != -1 && in_attrs->at(0) != -1;
+  }
+}
+
+inline bool NumpyBincountStorageType(const nnvm::NodeAttrs& attrs,
+                                     const int dev_mask,
+                                     DispatchMode* dispatch_mode,
+                                     std::vector<int> *in_attrs,
+                                     std::vector<int> *out_attrs) {
+  const NumpyBincountParam& param = nnvm::get<NumpyBincountParam>(attrs.parsed);
+  if (param.has_weights) {
+    CHECK_EQ(in_attrs->size(), 2U);
+  } else {
+    CHECK_EQ(in_attrs->size(), 1U);
+  }
+  CHECK_EQ(out_attrs->size(), 1U);
+  for (int &attr : *in_attrs) {
+    CHECK_EQ(attr, kDefaultStorage) << "Only default storage is supported";
+  }
+  for (int &attr : *out_attrs) {
+    attr = kDefaultStorage;
+  }
+  *dispatch_mode = DispatchMode::kFComputeEx;
+  return true;
+}
+
+template<typename xpu>
+void NumpyBincountForwardImpl(const OpContext &ctx,
+                              const NDArray &data,
+                              const NDArray &weights,
+                              const NDArray &out,
+                              const size_t &data_n,
+                              const int &minlength);
+
+template<typename xpu>
+void NumpyBincountForwardImpl(const OpContext &ctx,
+                              const NDArray &data,
+                              const NDArray &out,
+                              const size_t &data_n,
+                              const int &minlength);
+
+template<typename xpu>
+void NumpyBincountForward(const nnvm::NodeAttrs& attrs,
+                          const OpContext &ctx,
+                          const std::vector<NDArray> &inputs,
+                          const std::vector<OpReqType> &req,
+                          const std::vector<NDArray> &outputs) {
+  CHECK_GE(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK(req[0] == kWriteTo);
+  const NumpyBincountParam& param = nnvm::get<NumpyBincountParam>(attrs.parsed);
+  const bool has_weights = param.has_weights;
+  const int minlength = param.minlength;
+  const NDArray &data = inputs[0];
+  const NDArray &out = outputs[0];
+  CHECK_LE(data.shape().ndim(), 1U) << "Input only accept 1d array";
+  CHECK(!common::is_float(data.dtype())) <<"Input data should be int type";
+  size_t N = data.shape().Size();
+  if (N == 0) {
+    mshadow::Stream<xpu> *stream = ctx.get_stream<xpu>();
+    mxnet::TShape s(1, minlength);
+    const_cast<NDArray &>(out).Init(s);
+    MSHADOW_TYPE_SWITCH(out.dtype(), OType, {
+      mxnet_op::Kernel<mxnet_op::set_zero, xpu>::Launch(
+        stream, minlength, out.data().dptr<OType>());
+    });
+  } else {
+    if (has_weights) {
+      CHECK_EQ(inputs.size(), 2U);
+      const NDArray &weights = inputs[1];
+      CHECK_EQ(data.shape(), weights.shape()) << "weights should has same size as input";
+      NumpyBincountForwardImpl<xpu>(ctx, data, weights, out, N, minlength);
+    } else {
+      NumpyBincountForwardImpl<xpu>(ctx, data, out, N, minlength);
+    }
+  }
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_NUMPY_NP_BINCOUNT_OP_INL_H_
diff --git a/src/operator/numpy/np_bincount_op.cc b/src/operator/numpy/np_bincount_op.cc
new file mode 100644
index 000000000000..6256db176977
--- /dev/null
+++ b/src/operator/numpy/np_bincount_op.cc
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file np_bicount_op.cc
+ * \brief numpy compatible bincount operator CPU registration
+ */
+
+#include "./np_bincount_op-inl.h"
+
+namespace mxnet {
+namespace op {
+
+void BinNumberCount(const NDArray& data, const int& minlength,
+                    const NDArray& out, const size_t& N) {
+  int bin = minlength;
+  MSHADOW_TYPE_SWITCH(data.dtype(), DType, {
+    DType* data_ptr = data.data().dptr<DType>();
+    for (size_t i = 0; i < N; i++) {
+      CHECK_GE(data_ptr[i], 0) << "input should be nonnegative number";
+      if (data_ptr[i] + 1 > bin) {
+        bin = data_ptr[i] + 1;
+      }
+    }
+  });  // bin number = max(max(data) + 1, minlength)
+  mxnet::TShape s(1, bin);
+  const_cast<NDArray &>(out).Init(s);  // set the output shape forcefully
+}
+
+template<typename DType, typename OType>
+void BincountCpuWeights(const DType* data, const OType* weights,
+                        OType* out, const size_t& data_n) {
+  for (size_t i = 0; i < data_n; i++) {
+    int target = data[i];
+    out[target] += weights[i];
+  }
+}
+
+template<typename DType, typename OType>
+void BincountCpu(const DType* data, OType* out, const size_t& data_n) {
+  for (size_t i = 0; i < data_n; i++) {
+    int target = data[i];
+    out[target] += 1;
+  }
+}
+
+template<>
+void NumpyBincountForwardImpl<cpu>(const OpContext &ctx,
+                                   const NDArray &data,
+                                   const NDArray &weights,
+                                   const NDArray &out,
+                                   const size_t &data_n,
+                                   const int &minlength) {
+  using namespace mxnet_op;
+  BinNumberCount(data, minlength, out, data_n);
+  mshadow::Stream<cpu> *s = ctx.get_stream<cpu>();
+  MSHADOW_TYPE_SWITCH(data.dtype(), DType, {
+      MSHADOW_TYPE_SWITCH(weights.dtype(), OType, {
+        size_t out_size = out.shape()[0];
+        Kernel<set_zero, cpu>::Launch(s, out_size, out.data().dptr<OType>());
+        BincountCpuWeights(data.data().dptr<DType>(), weights.data().dptr<OType>(),
+                          out.data().dptr<OType>(), data_n);
+      });
+    });
+}
+
+template<>
+void NumpyBincountForwardImpl<cpu>(const OpContext &ctx,
+                                   const NDArray &data,
+                                   const NDArray &out,
+                                   const size_t &data_n,
+                                   const int &minlength) {
+  using namespace mxnet_op;
+  BinNumberCount(data, minlength, out, data_n);
+  mshadow::Stream<cpu> *s = ctx.get_stream<cpu>();
+  MSHADOW_TYPE_SWITCH(data.dtype(), DType, {
+      MSHADOW_TYPE_SWITCH(out.dtype(), OType, {
+        size_t out_size = out.shape()[0];
+        Kernel<set_zero, cpu>::Launch(s, out_size, out.data().dptr<OType>());
+        BincountCpu(data.data().dptr<DType>(), out.data().dptr<OType>(), data_n);
+      });
+    });
+}
+
+DMLC_REGISTER_PARAMETER(NumpyBincountParam);
+
+NNVM_REGISTER_OP(_npi_bincount)
+.set_attr_parser(ParamParser<NumpyBincountParam>)
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const NumpyBincountParam& params =
+    nnvm::get<NumpyBincountParam>(attrs.parsed);
+  return params.has_weights? 2 : 1;
+  })
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    const NumpyBincountParam& params =
+    nnvm::get<NumpyBincountParam>(attrs.parsed);
+    return params.has_weights ?
+           std::vector<std::string>{"data", "weights"} :
+           std::vector<std::string>{"data"};
+  })
+.set_attr<FResourceRequest>("FResourceRequest",
+[](const NodeAttrs& attrs) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+.set_attr<nnvm::FInferType>("FInferType", NumpyBincountType)
+.set_attr<FInferStorageType>("FInferStorageType", NumpyBincountStorageType)
+.set_attr<FComputeEx>("FComputeEx<cpu>", NumpyBincountForward<cpu>)
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
+.add_argument("data", "NDArray-or-Symbol", "Data")
+.add_argument("weights", "NDArray-or-Symbol", "Weights")
+.add_arguments(NumpyBincountParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_bincount_op.cu b/src/operator/numpy/np_bincount_op.cu
new file mode 100644
index 000000000000..ed1f90f00c16
--- /dev/null
+++ b/src/operator/numpy/np_bincount_op.cu
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file np_bicount_op.cu
+ * \brief numpy compatible bincount operator GPU registration
+ */
+
+#include "./np_bincount_op-inl.h"
+#include <thrust/device_ptr.h>
+#include <thrust/extrema.h>
+#include "../tensor/util/tensor_util-inl.cuh"
+#include "../tensor/util/tensor_util-inl.h"
+
+namespace mxnet {
+namespace op {
+
+struct BincountFusedKernel {
+  template<typename DType, typename OType>
+  static MSHADOW_XINLINE void Map(int i, const DType* data, OType* out) {
+    int idx = data[i];
+    atomicAdd(&out[idx], 1);
+  }
+
+  template<typename DType, typename OType>
+  static MSHADOW_XINLINE void Map(int i, const DType* data, const OType* weights,
+                                  OType* out) {
+    int idx = data[i];
+    atomicAdd(&out[idx], weights[i]);
+  }
+};
+
+struct is_valid_check {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, char* invalid_ptr, const DType* data) {
+    if (data[i] < 0) *invalid_ptr = 1;
+  }
+};
+
+template<typename DType>
+bool CheckInvalidInput(mshadow::Stream<gpu> *s, const DType *data, const size_t& data_size,
+                       char* is_valid_ptr) {
+  using namespace mxnet_op;
+  int32_t is_valid = 0;
+  Kernel<set_zero, gpu>::Launch(s, 1, is_valid_ptr);
+  Kernel<is_valid_check, gpu>::Launch(s, data_size, is_valid_ptr, data);
+  CUDA_CALL(cudaMemcpyAsync(&is_valid, is_valid_ptr, sizeof(char),
+                            cudaMemcpyDeviceToHost, mshadow::Stream<gpu>::GetStream(s)));
+  CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
+  return is_valid == 0;
+}
+
+template<>
+void NumpyBincountForwardImpl<gpu>(const OpContext &ctx,
+                                   const NDArray &data,
+                                   const NDArray &weights,
+                                   const NDArray &out,
+                                   const size_t &data_n,
+                                   const int &minlength) {
+  using namespace mxnet_op;
+  mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
+
+  MXNET_NO_FLOAT16_TYPE_SWITCH(data.dtype(), DType, {
+    DType* h_ptr;
+    DType* d_ptr;
+    int bin = minlength;
+    d_ptr = data.data().dptr<DType>();
+    Tensor<gpu, 1, char> workspace = ctx.requested[0]
+            .get_space_typed<gpu, 1, char>(Shape1(1), s);
+    char* is_valid_ptr = reinterpret_cast<char*>(workspace.dptr_);
+    bool is_valid = CheckInvalidInput(s, d_ptr, data_n, is_valid_ptr);
+    CHECK(is_valid) << "Input should be nonnegative number";   // check invalid input
+
+    h_ptr = reinterpret_cast<DType*>(malloc(data_n*sizeof(DType)));
+    CUDA_CALL(cudaMemcpyAsync(h_ptr, d_ptr, data_n*sizeof(DType), cudaMemcpyDeviceToHost,
+                              mshadow::Stream<gpu>::GetStream(s)));
+    CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
+    for (size_t i = 0; i < data_n; i++) {
+      if (h_ptr[i] + 1 > bin) bin = h_ptr[i] + 1;
+    }
+    free(h_ptr);
+    mxnet::TShape s(1, bin);
+    const_cast<NDArray &>(out).Init(s);  // set the output shape forcefully
+  });
+
+  MSHADOW_TYPE_SWITCH(data.dtype(), DType, {
+      MSHADOW_TYPE_SWITCH(weights.dtype(), OType, {
+        size_t out_size = out.shape().Size();
+        Kernel<set_zero, gpu>::Launch(s, out_size, out.data().dptr<OType>());
+        Kernel<BincountFusedKernel, gpu>::Launch(
+          s, data_n, data.data().dptr<DType>(), weights.data().dptr<OType>(),
+          out.data().dptr<OType>());
+      });
+    });
+}
+
+template<>
+void NumpyBincountForwardImpl<gpu>(const OpContext &ctx,
+                                   const NDArray &data,
+                                   const NDArray &out,
+                                   const size_t &data_n,
+                                   const int &minlength) {
+  using namespace mxnet_op;
+  mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
+
+  MXNET_NO_FLOAT16_TYPE_SWITCH(data.dtype(), DType, {
+    DType* h_ptr;
+    DType* d_ptr;
+    int bin = minlength;
+    d_ptr = data.data().dptr<DType>();
+    Tensor<gpu, 1, char> workspace = ctx.requested[0]
+            .get_space_typed<gpu, 1, char>(Shape1(1), s);
+    char* is_valid_ptr = reinterpret_cast<char*>(workspace.dptr_);
+    bool is_valid = CheckInvalidInput(s, d_ptr, data_n, is_valid_ptr);
+    CHECK(is_valid) << "Input should be nonnegative number";   // check invalid input
+
+    h_ptr = reinterpret_cast<DType*>(malloc(data_n*sizeof(DType)));
+    CUDA_CALL(cudaMemcpyAsync(h_ptr, d_ptr, data_n*sizeof(DType), cudaMemcpyDeviceToHost,
+                              mshadow::Stream<gpu>::GetStream(s)));
+    CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
+    for (size_t i = 0; i < data_n; i++) {
+      if (h_ptr[i] + 1 > bin) bin = h_ptr[i] + 1;
+    }
+    free(h_ptr);
+    mxnet::TShape s(1, bin);
+    const_cast<NDArray &>(out).Init(s);  // set the output shape forcefully
+  });
+
+  MSHADOW_TYPE_SWITCH(data.dtype(), DType, {
+    MSHADOW_TYPE_SWITCH(out.dtype(), OType, {
+      size_t out_size = out.shape().Size();
+      Kernel<set_zero, gpu>::Launch(s, out_size, out.data().dptr<OType>());
+      Kernel<BincountFusedKernel, gpu>::Launch(
+        s, data_n, data.data().dptr<DType>(), out.data().dptr<OType>());
+    });
+  });
+}
+
+NNVM_REGISTER_OP(_npi_bincount)
+.set_attr<FComputeEx>("FComputeEx<gpu>", NumpyBincountForward<gpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py
index a670f794860f..5b5af8b20e36 100644
--- a/tests/python/unittest/test_numpy_interoperability.py
+++ b/tests/python/unittest/test_numpy_interoperability.py
@@ -67,6 +67,23 @@ def _add_workload_unravel_index():
     OpArgMngr.add_workload('unravel_index', np.array([],dtype=_np.int64), (10, 3, 5))
     OpArgMngr.add_workload('unravel_index', np.array([3], dtype=_np.int32), (2,2))
 
+def _add_workload_bincount():
+    y = np.arange(4).astype(int)
+    y1 = np.array([1, 5, 2, 4, 1], dtype=_np.int64)
+    y2 = np.array((), dtype=_np.int8)
+    w = np.array([0.2, 0.3, 0.5, 0.1])
+    w1 = np.array([0.2, 0.3, 0.5, 0.1, 0.2])
+
+    OpArgMngr.add_workload('bincount', y)
+    OpArgMngr.add_workload('bincount', y1)
+    OpArgMngr.add_workload('bincount', y, w)
+    OpArgMngr.add_workload('bincount', y1, w1)
+    OpArgMngr.add_workload('bincount', y1, w1, 8)
+    OpArgMngr.add_workload('bincount', y, minlength=3)
+    OpArgMngr.add_workload('bincount', y, minlength=8)
+    OpArgMngr.add_workload('bincount', y2, minlength=0)
+    OpArgMngr.add_workload('bincount', y2, minlength=5)
+    
 
 def _add_workload_diag():
     def get_mat(n):
@@ -1409,6 +1426,7 @@ def _prepare_workloads():
     _add_workload_around()
     _add_workload_argsort()
     _add_workload_append()
+    _add_workload_bincount()
     _add_workload_broadcast_arrays(array_pool)
     _add_workload_broadcast_to()
     _add_workload_clip()
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index 545466bf0814..b39703b8ebda 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -5670,6 +5670,56 @@ def hybrid_forward(self, F, a):
             assert_almost_equal(elem_mx.asnumpy(), elem_np, rtol=rtol, atol=atol)
 
 
+@with_seed()
+@use_np
+def test_np_bincount():
+    class TestBincount(HybridBlock):
+        def __init__(self, minlength=0):
+            super(TestBincount, self).__init__()
+            self._minlength = minlength
+
+        def hybrid_forward(self, F, a):
+            return F.np.bincount(a, None, self._minlength)
+    
+    class TestBincountWeights(HybridBlock):
+        def __init__(self, minlength=0):
+            super(TestBincountWeights, self).__init__()
+            self._minlength = minlength
+
+        def hybrid_forward(self, F, a, weights):
+            return F.np.bincount(a, weights, self._minlength)
+
+    dtypes = [np.int8, np.uint8, np.int32, np.int64]
+    weight_types = [np.int32, np.int64, np.float16, np.float32, np.float64]
+    shapes = [(), (5,), (10,), (15,), (20,), (30,), (50,)]
+    min_lengths = [0, 5, 20, 50]
+    has_weights = [True, False] 
+    combinations = itertools.product([True, False], shapes, dtypes, weight_types, has_weights, min_lengths)
+    for hybridize, shape, dtype, weight_type, has_weight, minlength in combinations: 
+        rtol = 1e-2 if weight_type == np.float16 else 1e-3
+        atol = 1e-4 if weight_type == np.float16 else 1e-5
+        if shape != ():
+            data = np.random.uniform(0, 10, size=shape).astype(dtype) 
+            weights = np.random.uniform(0, 10, size=shape).astype(weight_type) if has_weight else None
+        else:
+            data = np.array(()).astype(dtype)
+            weights = np.array(()).astype(weight_type) if has_weight else None
+        weights_np = weights.asnumpy() if has_weight else None
+        test_bincount = TestBincountWeights(minlength) if has_weight else TestBincount(minlength)
+        if hybridize:
+            test_bincount.hybridize()
+        mx_out = test_bincount(data, weights) if has_weight else test_bincount(data)
+        np_out = _np.bincount(data.asnumpy(), weights_np, minlength)
+        assert mx_out.shape == np_out.shape
+        assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
+        # No backward operation for operator bincount at this moment
+
+        # Test imperative once again
+        mx_out = np.bincount(data, weights, minlength)
+        np_out = _np.bincount(data.asnumpy(), weights_np, minlength)
+        assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
+
+        
 if __name__ == '__main__':
     import nose
     nose.runmodule()

From 521c477ad32864d887481abf6c53acae3b717cf6 Mon Sep 17 00:00:00 2001
From: Yiyan66 <57363390+Yiyan66@users.noreply.github.com>
Date: Thu, 19 Dec 2019 14:42:09 +0800
Subject: [PATCH 04/25] [numpy] add op bitwise_not (#16947)

* all

* test op not change

* all

* delete describe

* have output

* add test

* only

* solve conflict
---
 python/mxnet/ndarray/numpy/_op.py             | 110 +++++++++++++++++-
 python/mxnet/numpy/multiarray.py              | 108 ++++++++++++++++-
 python/mxnet/numpy_dispatch_protocol.py       |   2 +
 python/mxnet/symbol/numpy/_symbol.py          |  94 ++++++++++++++-
 src/operator/mshadow_op.h                     |  12 ++
 .../numpy/np_elemwise_unary_op_basic.cc       |  14 +++
 .../numpy/np_elemwise_unary_op_basic.cu       |   3 +
 src/operator/operator_tune.cc                 |   1 +
 src/operator/tensor/elemwise_unary_op.h       |  17 +++
 .../unittest/test_numpy_interoperability.py   |  20 ++++
 tests/python/unittest/test_numpy_op.py        |  66 ++++++++++-
 11 files changed, 437 insertions(+), 10 deletions(-)

diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py
index 8d56c1f651a3..02e42145fb18 100644
--- a/python/mxnet/ndarray/numpy/_op.py
+++ b/python/mxnet/ndarray/numpy/_op.py
@@ -28,8 +28,8 @@
 from . import _internal as _npi
 from ..ndarray import NDArray
 
-__all__ = ['shape', 'zeros', 'zeros_like', 'ones', 'ones_like', 'full', 'full_like',
-           'add', 'subtract', 'multiply', 'divide', 'mod', 'remainder', 'power',
+__all__ = ['shape', 'zeros', 'zeros_like', 'ones', 'ones_like', 'full', 'full_like', 'invert',
+           'add', 'subtract', 'multiply', 'divide', 'mod', 'remainder', 'power', 'bitwise_not',
            'arctan2', 'sin', 'cos', 'tan', 'sinh', 'cosh', 'tanh', 'log10', 'sqrt', 'cbrt', 'abs',
            'absolute', 'exp', 'expm1', 'arcsin', 'arccos', 'arctan', 'sign', 'log', 'degrees', 'log2',
            'log1p', 'rint', 'radians', 'reciprocal', 'square', 'negative', 'fix', 'ceil', 'floor', 'histogram',
@@ -2690,6 +2690,112 @@ def floor(x, out=None, **kwargs):
     return _unary_func_helper(x, _npi.floor, _np.floor, out=out, **kwargs)
 
 
+@set_module('mxnet.ndarray.numpy')
+@wrap_np_unary_func
+def bitwise_not(x, out=None, **kwargs):
+    r"""
+    Compute bit-wise inversion, or bit-wise NOT, element-wise.
+    Computes the bit-wise NOT of the underlying binary representation of
+    the integers in the input arrays. This ufunc implements the C/Python
+    operator ``~``.
+
+    Parameters
+    ----------
+    x : array_like
+        Only integer and boolean types are handled.
+    out : ndarray, None, or tuple of ndarray and None, optional
+        A location into which the result is stored. If provided, it must have
+        a shape that the inputs broadcast to. If not provided or `None`,
+        a freshly-allocated array is returned. A tuple (possible only as a
+        keyword argument) must have length equal to the number of outputs.
+
+    Returns
+    -------
+    out : ndarray or scalar
+        Result.
+        This is a scalar if `x` is a scalar.
+
+    See Also
+    --------
+    bitwise_and, bitwise_or, bitwise_xor
+    logical_not
+    binary_repr :
+        Return the binary representation of the input number as a string.
+
+    Examples
+    --------
+    We've seen that 13 is represented by ``00001101``.
+    The invert or bit-wise NOT of 13 is then:
+
+    >>> x = np.invert(np.array(13, dtype=np.uint8))
+    >>> x
+    242
+    >>> np.binary_repr(x, width=8)
+    '11110010'
+
+    Notes
+    -----
+    `bitwise_not` is an alias for `invert`:
+
+    >>> np.bitwise_not is np.invert
+    True
+    """
+    return _unary_func_helper(x, _npi.bitwise_not, _np.bitwise_not, out=out, **kwargs)
+
+
+@set_module('mxnet.ndarray.numpy')
+@wrap_np_unary_func
+def invert(x, out=None, **kwargs):
+    r"""
+    Compute bit-wise inversion, or bit-wise NOT, element-wise.
+    Computes the bit-wise NOT of the underlying binary representation of
+    the integers in the input arrays. This ufunc implements the C/Python
+    operator ``~``.
+
+    Parameters
+    ----------
+    x : array_like
+        Only integer and boolean types are handled.
+    out : ndarray, None, or tuple of ndarray and None, optional
+        A location into which the result is stored. If provided, it must have
+        a shape that the inputs broadcast to. If not provided or `None`,
+        a freshly-allocated array is returned. A tuple (possible only as a
+        keyword argument) must have length equal to the number of outputs.
+
+    Returns
+    -------
+    out : ndarray or scalar
+        Result.
+        This is a scalar if `x` is a scalar.
+
+    See Also
+    --------
+    bitwise_and, bitwise_or, bitwise_xor
+    logical_not
+    binary_repr :
+        Return the binary representation of the input number as a string.
+
+    Examples
+    --------
+    We've seen that 13 is represented by ``00001101``.
+    The invert or bit-wise NOT of 13 is then:
+
+    >>> x = np.invert(np.array(13, dtype=np.uint8))
+    >>> x
+    242
+    >>> np.binary_repr(x, width=8)
+    '11110010'
+
+    Notes
+    -----
+    `bitwise_not` is an alias for `invert`:
+
+    >>> np.bitwise_not is np.invert
+    True
+    """
+    return _unary_func_helper(x, _npi.bitwise_not, _np.bitwise_not, out=out, **kwargs)
+
+
 @set_module('mxnet.ndarray.numpy')
 @wrap_np_unary_func
 def trunc(x, out=None, **kwargs):
diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py
index c3c6f4db8ba0..4910b4d6b925 100644
--- a/python/mxnet/numpy/multiarray.py
+++ b/python/mxnet/numpy/multiarray.py
@@ -47,8 +47,8 @@
 from ..ndarray.ndarray import _storage_type
 
 __all__ = ['ndarray', 'empty', 'array', 'shape', 'zeros', 'zeros_like', 'ones', 'ones_like', 'full', 'full_like',
-           'add', 'subtract', 'multiply', 'divide', 'mod', 'remainder', 'power',
-           'arctan2', 'sin', 'cos', 'tan', 'sinh', 'cosh', 'tanh', 'log10',
+           'add', 'subtract', 'multiply', 'divide', 'mod', 'remainder', 'power', 'bitwise_not',
+           'arctan2', 'sin', 'cos', 'tan', 'sinh', 'cosh', 'tanh', 'log10', 'invert',
            'sqrt', 'cbrt', 'abs', 'absolute', 'exp', 'expm1', 'arcsin', 'arccos', 'arctan', 'sign', 'log',
            'degrees', 'log2', 'log1p', 'rint', 'radians', 'reciprocal', 'square', 'negative', 'histogram',
            'fix', 'ceil', 'floor', 'trunc', 'logical_not', 'arcsinh', 'arccosh', 'arctanh', 'append', 'argsort',
@@ -3981,6 +3981,110 @@ def floor(x, out=None, **kwargs):
     """
     return _mx_nd_np.floor(x, out=out, **kwargs)
 
+@set_module('mxnet.numpy')
+@wrap_np_unary_func
+def invert(x, out=None, **kwargs):
+    r"""
+    Compute bit-wise inversion, or bit-wise NOT, element-wise.
+    Computes the bit-wise NOT of the underlying binary representation of
+    the integers in the input arrays. This ufunc implements the C/Python
+    operator ``~``.
+
+    Parameters
+    ----------
+    x : array_like
+        Only integer and boolean types are handled.
+    out : ndarray, None, or tuple of ndarray and None, optional
+        A location into which the result is stored. If provided, it must have
+        a shape that the inputs broadcast to. If not provided or `None`,
+        a freshly-allocated array is returned. A tuple (possible only as a
+        keyword argument) must have length equal to the number of outputs.
+
+    Returns
+    -------
+    out : ndarray or scalar
+        Result.
+        This is a scalar if `x` is a scalar.
+
+    See Also
+    --------
+    bitwise_and, bitwise_or, bitwise_xor
+    logical_not
+    binary_repr :
+        Return the binary representation of the input number as a string.
+
+    Examples
+    --------
+    We've seen that 13 is represented by ``00001101``.
+    The invert or bit-wise NOT of 13 is then:
+
+    >>> x = np.invert(np.array(13, dtype=np.uint8))
+    >>> x
+    242
+    >>> np.binary_repr(x, width=8)
+    '11110010'
+
+    Notes
+    -----
+    `bitwise_not` is an alias for `invert`:
+
+    >>> np.bitwise_not is np.invert
+    True
+    """
+    return _mx_nd_np.bitwise_not(x, out=out, **kwargs)
+
+@set_module('mxnet.numpy')
+@wrap_np_unary_func
+def bitwise_not(x, out=None, **kwargs):
+    r"""
+    Compute bit-wise inversion, or bit-wise NOT, element-wise.
+    Computes the bit-wise NOT of the underlying binary representation of
+    the integers in the input arrays. This ufunc implements the C/Python
+    operator ``~``.
+
+    Parameters
+    ----------
+    x : array_like
+        Only integer and boolean types are handled.
+    out : ndarray, None, or tuple of ndarray and None, optional
+        A location into which the result is stored. If provided, it must have
+        a shape that the inputs broadcast to. If not provided or `None`,
+        a freshly-allocated array is returned. A tuple (possible only as a
+        keyword argument) must have length equal to the number of outputs.
+
+    Returns
+    -------
+    out : ndarray or scalar
+        Result.
+        This is a scalar if `x` is a scalar.
+
+    See Also
+    --------
+    bitwise_and, bitwise_or, bitwise_xor
+    logical_not
+    binary_repr :
+        Return the binary representation of the input number as a string.
+
+    Examples
+    --------
+    We've seen that 13 is represented by ``00001101``.
+    The invert or bit-wise NOT of 13 is then:
+
+    >>> x = np.invert(np.array(13, dtype=np.uint8))
+    >>> x
+    242
+    >>> np.binary_repr(x, width=8)
+    '11110010'
+
+    Notes
+    -----
+    `bitwise_not` is an alias for `invert`:
+
+    >>> np.bitwise_not is np.invert
+    True
+    """
+    return _mx_nd_np.bitwise_not(x, out=out, **kwargs)
+
 
 @set_module('mxnet.numpy')
 @wrap_np_unary_func
diff --git a/python/mxnet/numpy_dispatch_protocol.py b/python/mxnet/numpy_dispatch_protocol.py
index bd5c388a5100..c7e9dd1398eb 100644
--- a/python/mxnet/numpy_dispatch_protocol.py
+++ b/python/mxnet/numpy_dispatch_protocol.py
@@ -219,6 +219,8 @@ def _register_array_function():
     'square',
     'cbrt',
     'reciprocal',
+    'invert',
+    'bitwise_not',
     'remainder',
     'sin',
     'cos',
diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py
index 0fb0d538082d..6efc333cc16c 100644
--- a/python/mxnet/symbol/numpy/_symbol.py
+++ b/python/mxnet/symbol/numpy/_symbol.py
@@ -36,7 +36,7 @@
 except ImportError:
     from builtins import slice as py_slice
 
-__all__ = ['zeros', 'zeros_like', 'ones', 'ones_like', 'full_like',
+__all__ = ['zeros', 'zeros_like', 'ones', 'ones_like', 'full_like', 'bitwise_not', 'invert',
            'add', 'subtract', 'multiply', 'divide', 'mod', 'remainder', 'power', 'arctan2',
            'sin', 'cos', 'tan', 'sinh', 'cosh', 'tanh', 'log10', 'sqrt', 'cbrt', 'abs', 'absolute', 'exp',
            'expm1', 'arcsin', 'arccos', 'arctan', 'sign', 'log', 'degrees', 'log2', 'log1p',
@@ -1040,6 +1040,98 @@ def ones(shape, dtype=_np.float32, order='C', ctx=None):
     return _npi.ones(shape=shape, ctx=ctx, dtype=dtype)
 
 
+@set_module('mxnet.symbol.numpy')
+@wrap_np_unary_func
+def invert(x, out=None, **kwargs):
+    r"""
+    Compute bit-wise inversion, or bit-wise NOT, element-wise.
+    Computes the bit-wise NOT of the underlying binary representation of
+    the integers in the input arrays. This ufunc implements the C/Python
+    operator ``~``.
+    Parameters
+    ----------
+    x : array_like
+        Only integer and boolean types are handled.
+    out : ndarray, None, or tuple of ndarray and None, optional
+        A location into which the result is stored. If provided, it must have
+        a shape that the inputs broadcast to. If not provided or `None`,
+        a freshly-allocated array is returned. A tuple (possible only as a
+        keyword argument) must have length equal to the number of outputs.
+    Returns
+    -------
+    out : ndarray or scalar
+        Result.
+        This is a scalar if `x` is a scalar.
+    See Also
+    --------
+    bitwise_and, bitwise_or, bitwise_xor
+    logical_not
+    binary_repr :
+        Return the binary representation of the input number as a string.
+    Examples
+    --------
+    We've seen that 13 is represented by ``00001101``.
+    The invert or bit-wise NOT of 13 is then:
+    >>> x = np.invert(np.array(13, dtype=np.uint8))
+    >>> x
+    242
+    >>> np.binary_repr(x, width=8)
+    '11110010'
+    Notes
+    -----
+    `bitwise_not` is an alias for `invert`:
+    >>> np.bitwise_not is np.invert
+    True
+    """
+    return _unary_func_helper(x, _npi.bitwise_not, _np.bitwise_not, out=out, **kwargs)
+
+
+@set_module('mxnet.symbol.numpy')
+@wrap_np_unary_func
+def bitwise_not(x, out=None, **kwargs):
+    r"""
+    Compute bit-wise inversion, or bit-wise NOT, element-wise.
+    Computes the bit-wise NOT of the underlying binary representation of
+    the integers in the input arrays. This ufunc implements the C/Python
+    operator ``~``.
+    Parameters
+    ----------
+    x : array_like
+        Only integer and boolean types are handled.
+    out : ndarray, None, or tuple of ndarray and None, optional
+        A location into which the result is stored. If provided, it must have
+        a shape that the inputs broadcast to. If not provided or `None`,
+        a freshly-allocated array is returned. A tuple (possible only as a
+        keyword argument) must have length equal to the number of outputs.
+    Returns
+    -------
+    out : ndarray or scalar
+        Result.
+        This is a scalar if `x` is a scalar.
+    See Also
+    --------
+    bitwise_and, bitwise_or, bitwise_xor
+    logical_not
+    binary_repr :
+        Return the binary representation of the input number as a string.
+    Examples
+    --------
+    We've seen that 13 is represented by ``00001101``.
+    The invert or bit-wise NOT of 13 is then:
+    >>> x = np.invert(np.array(13, dtype=np.uint8))
+    >>> x
+    242
+    >>> np.binary_repr(x, width=8)
+    '11110010'
+    Notes
+    -----
+    `bitwise_not` is an alias for `invert`:
+    >>> np.bitwise_not is np.invert
+    True
+    """
+    return _unary_func_helper(x, _npi.bitwise_not, _np.bitwise_not, out=out, **kwargs)
+
+
 @set_module('mxnet.symbol.numpy')
 def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None):  # pylint: disable=too-many-arguments
     """
diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index cf35e8858039..e3a3c0443428 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -359,6 +359,18 @@ MXNET_UNARY_MATH_OP(negation, -a);
 
 MXNET_UNARY_MATH_OP(reciprocal, 1.0f / math::id(a));
 
+struct bitwise_not : public mxnet_op::tunable {
+  template<typename DType,
+           typename std::enable_if<!std::is_same<DType, bool>::value, int>::type = 0>
+  MSHADOW_XINLINE static DType Map(DType a) {
+    return ~static_cast<int64_t>(a);
+  }
+
+  MSHADOW_XINLINE static bool Map(bool a) {
+    return !a;
+  }
+};
+
 MXNET_UNARY_MATH_OP(reciprocal_grad, -1.0f / math::sqr(a));
 
 MXNET_UNARY_MATH_OP(sigmoid, 1.0f / (1.0f + math::exp(-a)));
diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cc b/src/operator/numpy/np_elemwise_unary_op_basic.cc
index cad736aab65b..5e15d7ad4e67 100644
--- a/src/operator/numpy/np_elemwise_unary_op_basic.cc
+++ b/src/operator/numpy/np_elemwise_unary_op_basic.cc
@@ -169,6 +169,20 @@ Example::
 )code" ADD_FILELINE)
 .set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);
 
+// bitwise_not
+NNVM_REGISTER_OP(_npi_bitwise_not)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+     return std::vector<std::string>{"x"};
+})
+.set_attr<FCompute>("FCompute<cpu>", UnaryOp::ComputeInt<cpu, mshadow_op::bitwise_not>)
+.add_argument("x", "NDArray-or-Symbol", "The input array.")
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);
+
 // trunc
 MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_trunc, "x", mshadow_op::trunc)
 .describe(R"code(Return the truncated value of the input, element-wise.
diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cu b/src/operator/numpy/np_elemwise_unary_op_basic.cu
index af8834f01664..517ef9c2b52a 100644
--- a/src/operator/numpy/np_elemwise_unary_op_basic.cu
+++ b/src/operator/numpy/np_elemwise_unary_op_basic.cu
@@ -53,6 +53,9 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_ceil, mshadow_op::ceil);
 
 MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_floor, mshadow_op::floor);
 
+NNVM_REGISTER_OP(_npi_bitwise_not)
+.set_attr<FCompute>("FCompute<gpu>", UnaryOp::ComputeInt<gpu, mshadow_op::bitwise_not>);
+
 MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_trunc, mshadow_op::trunc);
 
 MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_fix, mshadow_op::fix);
diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc
index db898f8840f0..7ca594d21e59 100644
--- a/src/operator/operator_tune.cc
+++ b/src/operator/operator_tune.cc
@@ -311,6 +311,7 @@ IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::radians);  // NOLINT()
 IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::radians_grad);  // NOLINT()
 IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::nt);  // NOLINT()
 IMPLEMENT_UNARY_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mshadow_op::np_logical_not);  // NOLINT()
+IMPLEMENT_UNARY_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mshadow_op::bitwise_not);  // NOLINT()
 IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::nt);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::clip);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::clip);  // NOLINT()
diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h
index 05e10ffa4e16..4486b0dcd712 100644
--- a/src/operator/tensor/elemwise_unary_op.h
+++ b/src/operator/tensor/elemwise_unary_op.h
@@ -252,6 +252,23 @@ class UnaryOp : public OpBase {
     });
   }
 
+  template<typename xpu, typename OP>
+  static void ComputeInt(const nnvm::NodeAttrs& attrs,
+                      const OpContext& ctx,
+                      const std::vector<TBlob>& inputs,
+                      const std::vector<OpReqType>& req,
+                      const std::vector<TBlob>& outputs) {
+    mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+    MXNET_INT_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
+        if (inputs[0].Size() != 0) {
+          mxnet_op::Kernel<mxnet_op::op_with_req<OP, Req>, xpu>::Launch(
+            s, inputs[0].Size(), outputs[0].dptr<DType>(), inputs[0].dptr<DType>());
+        }
+      });
+    });
+  }
+
   template<typename xpu, typename OP>
   static void ComputeLogic(const nnvm::NodeAttrs& attrs,
                            const OpContext& ctx,
diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py
index 5b5af8b20e36..fcdf547bfbec 100644
--- a/tests/python/unittest/test_numpy_interoperability.py
+++ b/tests/python/unittest/test_numpy_interoperability.py
@@ -1242,6 +1242,24 @@ def _add_workload_logical_not(array_pool):
     OpArgMngr.add_workload('logical_not', np.array([True, False, True, False], dtype=np.bool))
 
 
+def _add_workload_bitwise_not():
+    OpArgMngr.add_workload('bitwise_not', np.array([True, False, True, False], dtype=np.bool))
+    for dtype in [np.int8, np.int32, np.int64]:
+        zeros = np.array([0], dtype=dtype)
+        ones = np.array([-1], dtype=dtype)
+        OpArgMngr.add_workload('bitwise_not', zeros)
+        OpArgMngr.add_workload('bitwise_not', ones)
+
+
+def _add_workload_invert():
+    OpArgMngr.add_workload('invert', np.array([True, False, True, False], dtype=np.bool))
+    for dtype in [np.int8, np.int32, np.int64]:
+        zeros = np.array([0], dtype=dtype)
+        ones = np.array([-1], dtype=dtype)
+        OpArgMngr.add_workload('invert', zeros)
+        OpArgMngr.add_workload('invert', ones)
+
+
 def _add_workload_vdot():
     OpArgMngr.add_workload('vdot', np.random.normal(size=(2, 4)), np.random.normal(size=(4, 2)))
     OpArgMngr.add_workload('vdot', np.random.normal(size=(2, 4)).astype(np.float64), np.random.normal(size=(2, 4)).astype(np.float64))
@@ -1526,6 +1544,8 @@ def _prepare_workloads():
     _add_workload_turnc(array_pool)
     _add_workload_floor(array_pool)
     _add_workload_logical_not(array_pool)
+    _add_workload_bitwise_not()
+    _add_workload_invert()
     _add_workload_vdot()
     _add_workload_vstack(array_pool)
     _add_workload_column_stack()
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index b39703b8ebda..af9228d45991 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -1778,6 +1778,66 @@ def hybrid_forward(self, F, a, *args, **kwargs):
             check_unary_func(func, ref_grad, shape, low, high)
 
 
+@with_seed()
+@use_np
+def test_np_bitwise_not():
+    def check_unary_func(func, ref_grad, shape, low, high):
+        class TestUnary(HybridBlock):
+            def __init__(self, func):
+                super(TestUnary, self).__init__()
+                self._func = func
+
+            def hybrid_forward(self, F, a, *args, **kwargs):
+                return getattr(F.np, self._func)(a)
+
+        np_func = getattr(_np, func)
+        mx_func = TestUnary(func)
+        np_test_data = _np.random.uniform(low, high, shape).astype(_np.int32)
+        mx_test_data = mx.numpy.array(np_test_data)
+        for hybridize in [True, False]:
+            if hybridize:
+                mx_func.hybridize()
+            if ref_grad:
+                mx_test_data.attach_grad()
+            np_out = np_func(np_test_data)
+            with mx.autograd.record():
+                y = mx_func(mx_test_data)
+            assert y.shape == np_out.shape
+            assert_almost_equal(y.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
+            if np_out.dtype == np.bool_:
+                assert y.dtype == np.bool_
+
+            if ref_grad:
+                y.backward()
+                assert_almost_equal(mx_test_data.grad.asnumpy(), ref_grad(np_test_data), rtol=1e-1, atol=1e-2, equal_nan=True)
+
+        np_out = getattr(_np, func)(np_test_data)
+        mx_out = getattr(mx.np, func)(mx_test_data)
+        assert mx_out.shape == np_out.shape
+        assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
+
+
+        assertRaises(NotImplementedError, getattr(np, func), mx_test_data, where=False)
+        assertRaises(NotImplementedError, getattr(np, func), mx_test_data,  subok=False)
+        assertRaises(NotImplementedError, getattr(np, func), mx_test_data,  dtype=_np.int8)
+        assertRaises(TypeError, getattr(np, func), mx_test_data,  dtype="abcdefg")
+        assertRaises(NotImplementedError, getattr(np, func), mx_test_data,  casting='safe')
+        assertRaises(TypeError, getattr(np, func), mx_test_data,  casting='mxnet')
+        assertRaises(NotImplementedError, getattr(np, func), mx_test_data,  order='C')
+        assertRaises(NotImplementedError, getattr(np, func), mx_test_data,  order='mxnet')
+
+    funcs = {
+        'bitwise_not' : (None, -5, 5),
+        'invert' : (None, -5, 5),
+    }
+    ndim = random.choice([2, 3, 4])
+    shape = random.choice([rand_shape_nd(ndim, dim=3), (1, 0, 2)])
+    for shape in [rand_shape_nd(ndim, dim=3), (1, 0, 2)]:
+        for func, func_data in funcs.items():
+            ref_grad, low, high = func_data
+            check_unary_func(func, ref_grad, shape, low, high)
+
+
 @with_seed()
 @use_np
 def test_np_binary_funcs():
@@ -3760,11 +3820,7 @@ def get_grad_b(A, X):
     nrhs = (-1, 0, 1, 2, 3)
     dtypes = ['float32', 'float64']
     for hybridize, shape, dtype, nrh in itertools.product([False, True], shapes, dtypes, nrhs):
-        rtol = 1e-3
-        atol = 1e-5
-        if dtype == 'float32':
-            rtol = 1e-2
-            atol = 1e-4
+        rtol, atol =1e-2, 1e-4
         test_solve = TestSolve()
         if hybridize:
             test_solve.hybridize()

From 8a3519934f3ee5e9ac9406c2a4edb377af5e8cc7 Mon Sep 17 00:00:00 2001
From: Przemyslaw Tredak <ptredak@nvidia.com>
Date: Fri, 20 Dec 2019 08:34:36 -0800
Subject: [PATCH 05/25] Improve the speed of the pointwise fusion graph pass
 (#17114)

* Debug the long startup time

* Optimize backward fusion

* Figure out why the fusion pass is called twice

* Cleaning

* Small optimization
---
 src/executor/simple_partition_pass.h | 98 ++++++++++++++++++----------
 src/imperative/cached_op.cc          | 22 ++++---
 2 files changed, 76 insertions(+), 44 deletions(-)

diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h
index 5b26a4523c13..ea1dcf39b8ba 100644
--- a/src/executor/simple_partition_pass.h
+++ b/src/executor/simple_partition_pass.h
@@ -102,8 +102,7 @@ class BidirectionalGraph {
   std::vector<std::unordered_set<Node*>> get_subsets(FCompatible is_compatible) {
     std::vector<std::unordered_set<Node*>> subgraphs;
     std::unordered_set<Node*> incomp_set;
-    std::unordered_set<Node*> all_set(nodes.size());
-    std::vector<PairSet> separation_sets;
+    std::vector<std::pair<bool, PairSet>> separation_sets;
     // Check each node for compatibility
     // and, if it is incompatible, mark nodes
     // on each side of it as not possible to be
@@ -111,48 +110,79 @@ class BidirectionalGraph {
     for (Node& node : nodes) {
       if (!is_compatible(node.nnvmptr)) {
         incomp_set.insert(&node);
-        std::unordered_set<Node*> in_graph;
-        std::unordered_set<Node*> out_graph;
-        std::vector<Node*> dummy_head;
-        dummy_head.emplace_back(&node);
-        DFS(dummy_head, false, [&out_graph, &is_compatible](Node* node) {
-          if (is_compatible(node->nnvmptr))
-            out_graph.insert(node);
-        });
-        DFS(dummy_head, true, [&in_graph, is_compatible](Node* node) {
-          if (is_compatible(node->nnvmptr))
-            in_graph.insert(node);
-        });
-        if (!(in_graph.empty() || out_graph.empty()))
-          separation_sets.push_back(std::make_pair(in_graph, out_graph));
       }
-      all_set.emplace(&node);
     }
-    IncompMap incomp_map;
-    std::unordered_set<Node*> comp_set;
-    comp_set.insert(all_set.begin(), all_set.end());
-    for (Node* n : incomp_set) {
-      comp_set.erase(n);
+    for (Node& node : nodes) {
+      if (incomp_set.count(&node) != 0) {
+        // Check if all your inputs are incompatible too.
+        // If so, then your separation set does not matter,
+        // because it will covered by the sets of your inputs
+        bool inside_node = true;
+        for (Node* input : node.inputs) {
+          if (incomp_set.count(input) == 0) {
+            inside_node = false;
+          }
+        }
+        if (!inside_node) {
+          std::unordered_set<Node*> in_graph;
+          std::unordered_set<Node*> out_graph;
+          std::vector<Node*> dummy_head;
+          dummy_head.emplace_back(&node);
+          DFS(dummy_head, false, [&out_graph](Node* node) {
+              out_graph.insert(node);
+          });
+          DFS(dummy_head, true, [&in_graph](Node* node) {
+              in_graph.insert(node);
+          });
+            separation_sets.push_back(std::make_pair(true,
+                                                     std::make_pair(in_graph, out_graph)));
+        } else {
+          separation_sets.push_back(std::make_pair(false, PairSet()));
+        }
+      } else {
+        separation_sets.push_back(std::make_pair(false, PairSet()));
+      }
     }
+    IncompMap incomp_map;
     // For each node construct the map of nodes that cannot be in
     // the same subset
-    for (Node* n : comp_set) {
-      for (PairSet p : separation_sets) {
-        if (p.first.count(n)) {
-          incomp_map[n].insert(p.second.begin(), p.second.end());
-        } else if (p.second.count(n)) {
-          incomp_map[n].insert(p.first.begin(), p.first.end());
+    index_t num_nodes = nodes.size();
+    for (index_t i = 0; i < num_nodes; ++i) {
+      const auto n = &(nodes[i]);
+      if (incomp_set.count(n) == 0) {
+        for (index_t j = i + 1; j < num_nodes; ++j) {
+          const auto& sep_set_pair = separation_sets[j];
+          if (sep_set_pair.first && incomp_map[n].count(&nodes[j]) == 0) {
+            const auto& p = sep_set_pair.second;
+            if (p.first.count(n)) {
+              incomp_map[n].insert(p.second.begin(), p.second.end());
+            } else if (p.second.count(n)) {
+              incomp_map[n].insert(p.first.begin(), p.first.end());
+            }
+          }
+        }
+        for (index_t j = i - 1; j >= 0; --j) {
+          const auto& sep_set_pair = separation_sets[j];
+          if (sep_set_pair.first && incomp_map[n].count(&nodes[j]) == 0) {
+            const auto& p = sep_set_pair.second;
+            if (p.first.count(n)) {
+              incomp_map[n].insert(p.second.begin(), p.second.end());
+            } else if (p.second.count(n)) {
+              incomp_map[n].insert(p.first.begin(), p.first.end());
+            }
+          }
+        }
+        for (Node* incomp_n : incomp_set) {
+          incomp_map[n].erase(incomp_n);
         }
-      }
-      for (Node* incomp_n : incomp_set) {
-        incomp_map[n].erase(incomp_n);
       }
     }
     std::unordered_set<Node*> unused_set;
-    unused_set.reserve(comp_set.size());
 
-    for (auto& n : comp_set) {
-      unused_set.insert(n);
+    for (auto& n : nodes) {
+      if (incomp_set.count(&n) == 0) {
+        unused_set.insert(&n);
+      }
     }
     std::unordered_set<Node*> visited;
     std::deque<Node*> stack(outputs.begin(), outputs.end());
diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc
index ec5a79a2e675..1edd9897ec82 100644
--- a/src/imperative/cached_op.cc
+++ b/src/imperative/cached_op.cc
@@ -1032,17 +1032,19 @@ OpStatePtr CachedOp::Forward(
   CHECK_EQ(inputs.size(), num_inputs());
 
   Context default_ctx = inputs[0]->ctx();
-  auto state_ptr = GetCachedOpState(default_ctx);
-  auto& state = state_ptr.get_state<CachedOpState>();
+  {
+    auto state_ptr = GetCachedOpState(default_ctx);
+    auto& state = state_ptr.get_state<CachedOpState>();
 
-  const auto& idx = state.info.fwd_graph.indexed_graph();
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    CHECK_EQ(inputs[i]->ctx(), default_ctx)
-        << "CachedOp requires all inputs to live on the same context. But "
-        << idx[idx.input_nodes()[0]].source->attrs.name
-        << " is on " << default_ctx << " while "
-        << idx[idx.input_nodes()[i]].source->attrs.name
-        << " is on " << inputs[i]->ctx();
+    const auto& idx = state.info.fwd_graph.indexed_graph();
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      CHECK_EQ(inputs[i]->ctx(), default_ctx)
+          << "CachedOp requires all inputs to live on the same context. But "
+          << idx[idx.input_nodes()[0]].source->attrs.name
+          << " is on " << default_ctx << " while "
+          << idx[idx.input_nodes()[i]].source->attrs.name
+          << " is on " << inputs[i]->ctx();
+    }
   }
 
   int prev_bulk_size = Engine::Get()->set_bulk_size(config_.forward_bulk_size);

From 615f609da748a771901478cf37e37064763b0d8c Mon Sep 17 00:00:00 2001
From: liuzh91 <liuzhuanghua1991@gmail.com>
Date: Sat, 21 Dec 2019 08:31:40 +0800
Subject: [PATCH 06/25] fix parameter names in the estimator api (#17051)

---
 .../contrib/estimator/batch_processor.py      |  4 +--
 .../gluon/contrib/estimator/estimator.py      | 30 ++++++++---------
 .../unittest/test_gluon_batch_processor.py    |  4 +--
 tests/python/unittest/test_gluon_estimator.py | 32 +++++++++----------
 4 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/python/mxnet/gluon/contrib/estimator/batch_processor.py b/python/mxnet/gluon/contrib/estimator/batch_processor.py
index 4985f8c81bf3..aa5adbfdea5f 100644
--- a/python/mxnet/gluon/contrib/estimator/batch_processor.py
+++ b/python/mxnet/gluon/contrib/estimator/batch_processor.py
@@ -61,8 +61,8 @@ def evaluate_batch(self, estimator,
             Batch axis to split the validation data into devices.
         """
         data, label = self._get_data_and_label(val_batch, estimator.context, batch_axis)
-        pred = [estimator.eval_net(x) for x in data]
-        loss = [estimator.evaluation_loss(y_hat, y) for y_hat, y in zip(pred, label)]
+        pred = [estimator.val_net(x) for x in data]
+        loss = [estimator.val_loss(y_hat, y) for y_hat, y in zip(pred, label)]
 
         return data, label, pred, loss
 
diff --git a/python/mxnet/gluon/contrib/estimator/estimator.py b/python/mxnet/gluon/contrib/estimator/estimator.py
index 09f43151e235..ed8a53d7c3a6 100644
--- a/python/mxnet/gluon/contrib/estimator/estimator.py
+++ b/python/mxnet/gluon/contrib/estimator/estimator.py
@@ -61,22 +61,19 @@ class Estimator(object):
         Trainer to apply optimizer on network parameters.
     context : Context or list of Context
         Device(s) to run the training on.
-    evaluation_loss : gluon.loss.loss
-        Loss (objective) function to calculate during validation. If set evaluation_loss
-        None, it will use the same loss function as self.loss
-    eval_net : gluon.Block
+    val_net : gluon.Block
         The model used for validation. The validation model does not necessarily belong to
         the same model class as the training model. But the two models typically share the
         same architecture. Therefore the validation model can reuse parameters of the
         training model.
 
-        The code example of consruction of eval_net sharing the same network parameters as
+        The code example of consruction of val_net sharing the same network parameters as
         the training net is given below:
 
         >>> net = _get_train_network()
-        >>> eval_net = _get_test_network(params=net.collect_params())
+        >>> val_net = _get_test_network(params=net.collect_params())
         >>> net.initialize(ctx=ctx)
-        >>> est = Estimator(net, loss, eval_net=eval_net)
+        >>> est = Estimator(net, loss, val_net=val_net)
 
         Proper namespace match is required for weight sharing between two networks. Most networks
         inheriting :py:class:`Block` can share their parameters correctly. An exception is
@@ -84,6 +81,9 @@ class Estimator(object):
         the  naming in mxnet Gluon API, please refer to the site
         (https://mxnet.apache.org/api/python/docs/tutorials/packages/gluon/blocks/naming.html)
         for future information.
+    val_loss : gluon.loss.loss
+        Loss (objective) function to calculate during validation. If set val_loss
+        None, it will use the same loss function as self.loss
     batch_processor: BatchProcessor
         BatchProcessor provides customized fit_batch() and evaluate_batch() methods
     """
@@ -113,8 +113,8 @@ def __init__(self, net,
                  initializer=None,
                  trainer=None,
                  context=None,
-                 evaluation_loss=None,
-                 eval_net=None,
+                 val_net=None,
+                 val_loss=None,
                  batch_processor=None):
         self.net = net
         self.loss = self._check_loss(loss)
@@ -122,12 +122,12 @@ def __init__(self, net,
         self._val_metrics = _check_metrics(val_metrics)
         self._add_default_training_metrics()
         self._add_validation_metrics()
-        self.evaluation_loss = self.loss
-        if evaluation_loss is not None:
-            self.evaluation_loss = self._check_loss(evaluation_loss)
-        self.eval_net = self.net
-        if eval_net is not None:
-            self.eval_net = eval_net
+        self.val_loss = self.loss
+        if val_loss is not None:
+            self.val_loss = self._check_loss(val_loss)
+        self.val_net = self.net
+        if val_net is not None:
+            self.val_net = val_net
 
         self.logger = logging.Logger(name='Estimator', level=logging.INFO)
         self.logger.addHandler(logging.StreamHandler(sys.stdout))
diff --git a/tests/python/unittest/test_gluon_batch_processor.py b/tests/python/unittest/test_gluon_batch_processor.py
index 4bd6f769aa44..8604713fc129 100644
--- a/tests/python/unittest/test_gluon_batch_processor.py
+++ b/tests/python/unittest/test_gluon_batch_processor.py
@@ -84,7 +84,7 @@ def test_batch_processor_validation():
     ctx = mx.cpu()
     loss = gluon.loss.L2Loss()
     acc = mx.metric.Accuracy()
-    evaluation_loss = gluon.loss.L1Loss()
+    val_loss = gluon.loss.L1Loss()
     net.initialize(ctx=ctx)
     processor = BatchProcessor()
     trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001})
@@ -93,7 +93,7 @@ def test_batch_processor_validation():
                     train_metrics=acc,
                     trainer=trainer,
                     context=ctx,
-                    evaluation_loss=evaluation_loss,
+                    val_loss=val_loss,
                     batch_processor=processor)
     # Input dataloader
     est.fit(train_data=dataloader,
diff --git a/tests/python/unittest/test_gluon_estimator.py b/tests/python/unittest/test_gluon_estimator.py
index 924dd083bef4..ca61e4b40caa 100644
--- a/tests/python/unittest/test_gluon_estimator.py
+++ b/tests/python/unittest/test_gluon_estimator.py
@@ -88,7 +88,7 @@ def test_validation():
     ctx = mx.cpu()
     loss = gluon.loss.L2Loss()
     acc = mx.metric.Accuracy()
-    evaluation_loss = gluon.loss.L1Loss()
+    val_loss = gluon.loss.L1Loss()
     net.initialize(ctx=ctx)
     trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001})
     est = Estimator(net=net,
@@ -96,7 +96,7 @@ def test_validation():
                     train_metrics=acc,
                     trainer=trainer,
                     context=ctx,
-                    evaluation_loss=evaluation_loss)
+                    val_loss=val_loss)
     # Input dataloader
     est.fit(train_data=dataloader,
             val_data=dataloader,
@@ -376,16 +376,16 @@ def test_default_handlers():
     assert isinstance(handlers[1], MetricHandler)
     assert isinstance(handlers[4], LoggingHandler)
 
-def test_eval_net():
-    ''' test estimator with a different evaluation net '''
+def test_val_net():
+    ''' test estimator with different training and validation networks '''
     ''' test weight sharing of sequential networks without namescope '''
     net = _get_test_network()
-    eval_net = _get_test_network(params=net.collect_params())
+    val_net = _get_test_network(params=net.collect_params())
     dataloader, dataiter = _get_test_data()
     num_epochs = 1
     ctx = mx.cpu()
     loss = gluon.loss.L2Loss()
-    evaluation_loss = gluon.loss.L2Loss()
+    val_loss = gluon.loss.L2Loss()
     acc = mx.metric.Accuracy()
     net.initialize(ctx=ctx)
     trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001})
@@ -394,8 +394,8 @@ def test_eval_net():
                     train_metrics=acc,
                     trainer=trainer,
                     context=ctx,
-                    evaluation_loss=evaluation_loss,
-                    eval_net=eval_net)
+                    val_loss=val_loss,
+                    val_net=val_net)
 
     with assert_raises(RuntimeError):
         est.fit(train_data=dataloader,
@@ -404,7 +404,7 @@ def test_eval_net():
 
     ''' test weight sharing of sequential networks with namescope '''
     net = _get_test_network_with_namescope()
-    eval_net = _get_test_network_with_namescope(params=net.collect_params())
+    val_net = _get_test_network_with_namescope(params=net.collect_params())
     net.initialize(ctx=ctx)
     trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001})
     est = Estimator(net=net,
@@ -412,8 +412,8 @@ def test_eval_net():
                     train_metrics=acc,
                     trainer=trainer,
                     context=ctx,
-                    evaluation_loss=evaluation_loss,
-                    eval_net=eval_net)
+                    val_loss=val_loss,
+                    val_net=val_net)
 
     est.fit(train_data=dataloader,
             val_data=dataloader,
@@ -422,20 +422,20 @@ def test_eval_net():
     ''' test weight sharing of two resnets '''
     net = gluon.model_zoo.vision.resnet18_v1(pretrained=False, ctx=ctx)
     net.output = gluon.nn.Dense(10)
-    eval_net = gluon.model_zoo.vision.resnet18_v1(pretrained=False, ctx=ctx)
-    eval_net.output = gluon.nn.Dense(10, params=net.collect_params())
+    val_net = gluon.model_zoo.vision.resnet18_v1(pretrained=False, ctx=ctx)
+    val_net.output = gluon.nn.Dense(10, params=net.collect_params())
     dataset = gluon.data.ArrayDataset(mx.nd.zeros((10, 3, 224, 224)), mx.nd.zeros((10, 10)))
     dataloader = gluon.data.DataLoader(dataset=dataset, batch_size=5)
     net.initialize(ctx=ctx)
-    eval_net.initialize(ctx=ctx)
+    val_net.initialize(ctx=ctx)
     trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001})
     est = Estimator(net=net,
                     loss=loss,
                     train_metrics=acc,
                     trainer=trainer,
                     context=ctx,
-                    evaluation_loss=evaluation_loss,
-                    eval_net=eval_net)
+                    val_loss=val_loss,
+                    val_net=val_net)
 
     est.fit(train_data=dataloader,
             val_data=dataloader,

From d000c3baa32171964f1b8ed3780472af0e05be1a Mon Sep 17 00:00:00 2001
From: Xi Wang <xidulu@gmail.com>
Date: Sat, 21 Dec 2019 10:31:27 +0800
Subject: [PATCH 07/25] [Numpy ]Modify np.random.shuffle to enable inplace by
 default (#17133)

* shuffle done

* fix dodstring
---
 python/mxnet/ndarray/numpy/random.py | 38 +++++++++++++++++++++++++++-
 python/mxnet/numpy/random.py         | 38 +++++++++++++++++++++++++++-
 python/mxnet/symbol/numpy/random.py  | 38 +++++++++++++++++++++++++++-
 src/operator/random/shuffle_op.cc    |  2 +-
 4 files changed, 112 insertions(+), 4 deletions(-)

diff --git a/python/mxnet/ndarray/numpy/random.py b/python/mxnet/ndarray/numpy/random.py
index 9d1a6f9119ee..e67c766c6bdf 100644
--- a/python/mxnet/ndarray/numpy/random.py
+++ b/python/mxnet/ndarray/numpy/random.py
@@ -23,7 +23,7 @@
 from ..ndarray import NDArray
 
 
-__all__ = ['randint', 'uniform', 'normal', "choice", "rand", "multinomial"]
+__all__ = ['randint', 'uniform', 'normal', "choice", "rand", "multinomial", "shuffle"]
 
 
 def randint(low, high=None, size=None, dtype=None, ctx=None, out=None):
@@ -344,3 +344,39 @@ def rand(*size, **kwargs):
     for s in size:
         output_shape += (s,)
     return uniform(0, 1, size=output_shape, **kwargs)
+
+
+def shuffle(x):
+    """
+    Modify a sequence in-place by shuffling its contents.
+
+    This function only shuffles the array along the first axis of a
+    multi-dimensional array. The order of sub-arrays is changed but
+    their contents remain the same.
+
+    Parameters
+    ----------
+    x: ndarray
+        The array or list to be shuffled.
+
+    Returns
+    -------
+    None
+
+    Examples
+    --------
+    >>> arr = np.arange(10)
+    >>> np.random.shuffle(arr)
+    >>> arr
+    array([5., 1., 0., 6., 7., 3., 9., 8., 4., 2.])  # random
+
+    Multi-dimensional arrays are only shuffled along the first axis:
+
+    >>> arr = np.arange(9).reshape((3, 3))
+    >>> np.random.shuffle(arr)
+    >>> arr
+    array([[6., 7., 8.], # random
+           [3., 4., 5.],
+           [0., 1., 2.]])
+    """
+    _npi.shuffle(x, out=x)
diff --git a/python/mxnet/numpy/random.py b/python/mxnet/numpy/random.py
index 1cad4a55c466..ebc24de63282 100644
--- a/python/mxnet/numpy/random.py
+++ b/python/mxnet/numpy/random.py
@@ -20,7 +20,7 @@
 from __future__ import absolute_import
 from ..ndarray import numpy as _mx_nd_np
 
-__all__ = ["randint", "uniform", "normal", "choice", "rand", "multinomial"]
+__all__ = ["randint", "uniform", "normal", "choice", "rand", "multinomial", "shuffle"]
 
 
 def randint(low, high=None, size=None, dtype=None, ctx=None, out=None):
@@ -321,3 +321,39 @@ def rand(*size, **kwargs):
     for s in size:
         output_shape += (s,)
     return _mx_nd_np.random.uniform(0, 1, size=output_shape, **kwargs)
+
+
+def shuffle(x):
+    """
+    Modify a sequence in-place by shuffling its contents.
+
+    This function only shuffles the array along the first axis of a
+    multi-dimensional array. The order of sub-arrays is changed but
+    their contents remain the same.
+
+    Parameters
+    ----------
+    x: ndarray
+        The array or list to be shuffled.
+
+    Returns
+    -------
+    None
+
+    Examples
+    --------
+    >>> arr = np.arange(10)
+    >>> np.random.shuffle(arr)
+    >>> arr
+    array([5., 1., 0., 6., 7., 3., 9., 8., 4., 2.])  # random
+
+    Multi-dimensional arrays are only shuffled along the first axis:
+
+    >>> arr = np.arange(9).reshape((3, 3))
+    >>> np.random.shuffle(arr)
+    >>> arr
+    array([[6., 7., 8.], # random
+           [3., 4., 5.],
+           [0., 1., 2.]])
+    """
+    _mx_nd_np.random.shuffle(x)
diff --git a/python/mxnet/symbol/numpy/random.py b/python/mxnet/symbol/numpy/random.py
index 48bccb64a2b4..94c29f407acc 100644
--- a/python/mxnet/symbol/numpy/random.py
+++ b/python/mxnet/symbol/numpy/random.py
@@ -21,7 +21,7 @@
 from ...context import current_context
 from . import _internal as _npi
 
-__all__ = ['randint', 'uniform', 'normal', 'rand']
+__all__ = ['randint', 'uniform', 'normal', 'rand', 'shuffle']
 
 
 def randint(low, high=None, size=None, dtype=None, ctx=None, out=None):
@@ -288,3 +288,39 @@ def choice(a, size=None, replace=True, p=None, ctx=None, out=None):
             return _npi.choice(a=a, size=size, replace=replace, ctx=ctx, weighted=False, out=out)
         else:
             return _npi.choice(p, a=a, size=size, replace=replace, ctx=ctx, weighted=True, out=out)
+
+
+def shuffle(x):
+    """
+    Modify a sequence in-place by shuffling its contents.
+
+    This function only shuffles the array along the first axis of a
+    multi-dimensional array. The order of sub-arrays is changed but
+    their contents remain the same.
+
+    Parameters
+    ----------
+    x: _Symbol
+        The array or list to be shuffled.
+
+    Returns
+    -------
+    None
+
+    Examples
+    --------
+    >>> arr = np.arange(10)
+    >>> np.random.shuffle(arr)
+    >>> arr
+    array([5., 1., 0., 6., 7., 3., 9., 8., 4., 2.])  # random
+
+    Multi-dimensional arrays are only shuffled along the first axis:
+
+    >>> arr = np.arange(9).reshape((3, 3))
+    >>> np.random.shuffle(arr)
+    >>> arr
+    array([[6., 7., 8.], # random
+           [3., 4., 5.],
+           [0., 1., 2.]])
+    """
+    _npi.shuffle(x, out=x)
diff --git a/src/operator/random/shuffle_op.cc b/src/operator/random/shuffle_op.cc
index 86797c136bab..0f64fbc51449 100644
--- a/src/operator/random/shuffle_op.cc
+++ b/src/operator/random/shuffle_op.cc
@@ -122,7 +122,7 @@ void ShuffleForwardCPU(const nnvm::NodeAttrs& attrs,
 
 NNVM_REGISTER_OP(_shuffle)
 .add_alias("shuffle")
-.add_alias("_np__random_shuffle")
+.add_alias("_npi_shuffle")
 .describe(R"code(Randomly shuffle the elements.
 
 This shuffles the array along the first axis.

From 5aa3a7a5251f7ed9276fa898158957da0538df4f Mon Sep 17 00:00:00 2001
From: Leonard Lausen <lausen@amazon.com>
Date: Mon, 23 Dec 2019 12:31:13 +0800
Subject: [PATCH 08/25] Disable OpenMP offloading support for 3rdparty/openmp
 (#17098)

* Disable OpenMP offloading support for 3rdparty/openmp

OpenMP offloading was introduced some time during the past two years and is
enabled by default. With upgrading 3rdparty/openmp in
https://github.com/apache/incubator-mxnet/pull/17012
it was made part of the MXNet CMake build.

But we don't use OpenMP offloading and the Cuda target in the llvm OpenMP
Offloading build is broken in our setting.

* Update CMake on CI
---
 CMakeLists.txt                           |  3 +-
 ci/build_windows.py                      | 48 +++++++++++++++---------
 ci/docker/Dockerfile.build.android_armv7 |  2 +-
 ci/docker/Dockerfile.build.android_armv8 |  4 +-
 ci/docker/Dockerfile.build.armv6         |  2 +-
 ci/docker/Dockerfile.build.armv7         |  2 +-
 ci/docker/Dockerfile.build.armv8         |  2 +-
 ci/docker/Dockerfile.build.jetson        |  2 +-
 ci/docker/install/ubuntu_core.sh         |  9 ++---
 ci/util.py                               | 33 +++++++++++++++-
 10 files changed, 75 insertions(+), 32 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fcbe707aa9ef..f0779abd06d4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.0.2)
+cmake_minimum_required(VERSION 3.13)
 
 # workaround to store CMAKE_CROSSCOMPILING because is getting reset by the project command
 if(CMAKE_CROSSCOMPILING)
@@ -452,6 +452,7 @@ if(USE_OPENMP)
     set(OPENMP_STANDALONE_BUILD TRUE)
     set(LIBOMP_ENABLE_SHARED TRUE)
     set(CMAKE_BUILD_TYPE Release)
+    set(OPENMP_ENABLE_LIBOMPTARGET OFF CACHE BOOL "LLVM OpenMP offloading support")  # Requires CMP0077 CMake 3.13
     add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/openmp)
   endfunction()
 
diff --git a/ci/build_windows.py b/ci/build_windows.py
index ce77c316ab20..5839e8d793d1 100755
--- a/ci/build_windows.py
+++ b/ci/build_windows.py
@@ -1,4 +1,4 @@
-﻿#!/usr/bin/env python
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
 # Licensed to the Apache Software Foundation (ASF) under one
@@ -28,7 +28,9 @@
 import platform
 import shutil
 import sys
+import tempfile
 import time
+import zipfile
 from distutils.dir_util import copy_tree
 from enum import Enum
 from subprocess import check_call
@@ -147,22 +149,33 @@ def windows_build(args):
     mxnet_root = get_mxnet_root()
     logging.info("Found MXNet root: {}".format(mxnet_root))
 
-    with remember_cwd():
-        os.chdir(path)
-        cmd = "\"{}\" && cmake -G \"NMake Makefiles JOM\" {} {}".format(args.vcvars,
-                                                                        CMAKE_FLAGS[args.flavour],
-                                                                        mxnet_root)
-        logging.info("Generating project with CMake:\n{}".format(cmd))
-        check_call(cmd, shell=True)
-
-        cmd = "\"{}\" && jom".format(args.vcvars)
-        logging.info("Building with jom:\n{}".format(cmd))
-
-        t0 = int(time.time())
-        check_call(cmd, shell=True)
-
-        logging.info("Build flavour: {} complete in directory: \"{}\"".format(args.flavour, os.path.abspath(path)))
-        logging.info("Build took {}".format(datetime.timedelta(seconds=int(time.time() - t0))))
+    url = 'https://github.com/Kitware/CMake/releases/download/v3.16.1/cmake-3.16.1-win64-x64.zip'
+    with tempfile.TemporaryDirectory() as tmpdir:
+        cmake_file_path = download_file(url, tmpdir)
+        with zipfile.ZipFile(cmake_file_path, 'r') as zip_ref:
+            # Create $tmpdir\cmake-3.16.1-win64-x64\bin\cmake.exe
+            zip_ref.extractall(tmpdir)
+
+        with remember_cwd():
+            os.chdir(path)
+            cmd = "\"{}\" && {} -G \"NMake Makefiles JOM\" {} {}".format(
+                args.vcvars,
+                os.path.join(tmpdir, 'cmake-3.16.1-win64-x64', 'bin', 'cmake.exe'),
+                CMAKE_FLAGS[args.flavour], mxnet_root)
+            logging.info("Generating project with CMake:\n{}".format(cmd))
+            check_call(cmd, shell=True)
+
+            cmd = "\"{}\" && jom".format(args.vcvars)
+            logging.info("Building with jom:\n{}".format(cmd))
+
+            t0 = int(time.time())
+            check_call(cmd, shell=True)
+
+            logging.info(
+                "Build flavour: {} complete in directory: \"{}\"".format(
+                    args.flavour, os.path.abspath(path)))
+            logging.info("Build took {}".format(
+                datetime.timedelta(seconds=int(time.time() - t0))))
     windows_package(args)
 
 
@@ -262,4 +275,3 @@ def main():
 
 if __name__ == '__main__':
     sys.exit(main())
-
diff --git a/ci/docker/Dockerfile.build.android_armv7 b/ci/docker/Dockerfile.build.android_armv7
index a2e98cd2efe1..2c923a015b63 100644
--- a/ci/docker/Dockerfile.build.android_armv7
+++ b/ci/docker/Dockerfile.build.android_armv7
@@ -18,7 +18,7 @@
 #
 # Dockerfile to build MXNet for Android ARMv7
 
-FROM mxnetcipinned/dockcross-base:11262018
+FROM dockcross/base
 MAINTAINER Pedro Larroy "pllarroy@amazon.com"
 
 # The cross-compiling emulator
diff --git a/ci/docker/Dockerfile.build.android_armv8 b/ci/docker/Dockerfile.build.android_armv8
index f7de86763457..ca62288129bb 100644
--- a/ci/docker/Dockerfile.build.android_armv8
+++ b/ci/docker/Dockerfile.build.android_armv8
@@ -18,7 +18,7 @@
 #
 # Dockerfile to build MXNet for Android ARM64/ARMv8
 
-FROM mxnetcipinned/dockcross-base:11262018
+FROM dockcross/base
 MAINTAINER Pedro Larroy "pllarroy@amazon.com"
 
 RUN apt-get update && apt-get install -y \
@@ -82,4 +82,4 @@ RUN /work/ubuntu_adduser.sh
 
 COPY runtime_functions.sh /work/
 
-WORKDIR /work/build
\ No newline at end of file
+WORKDIR /work/build
diff --git a/ci/docker/Dockerfile.build.armv6 b/ci/docker/Dockerfile.build.armv6
index 60e223b7a60f..e6a7ffe758b9 100644
--- a/ci/docker/Dockerfile.build.armv6
+++ b/ci/docker/Dockerfile.build.armv6
@@ -18,7 +18,7 @@
 #
 # Dockerfile to build MXNet for ARMv6
 
-FROM mxnetcipinned/dockcross-linux-armv6:11262018
+FROM dockcross/linux-armv6
 
 ENV ARCH armv6l
 ENV HOSTCC gcc
diff --git a/ci/docker/Dockerfile.build.armv7 b/ci/docker/Dockerfile.build.armv7
index 0b557d5839e9..bad9ab214050 100644
--- a/ci/docker/Dockerfile.build.armv7
+++ b/ci/docker/Dockerfile.build.armv7
@@ -18,7 +18,7 @@
 #
 # Dockerfile to build MXNet for Android ARMv7
 
-FROM mxnetcipinned/dockcross-linux-armv7:11262018
+FROM dockcross/linux-armv7
 
 ENV ARCH armv7l
 ENV HOSTCC gcc
diff --git a/ci/docker/Dockerfile.build.armv8 b/ci/docker/Dockerfile.build.armv8
index ef9c95865590..bd2373180f0b 100644
--- a/ci/docker/Dockerfile.build.armv8
+++ b/ci/docker/Dockerfile.build.armv8
@@ -18,7 +18,7 @@
 #
 # Dockerfile to build MXNet for ARM64/ARMv8
 
-FROM mxnetcipinned/dockcross-linux-arm64:11262018
+FROM dockcross/linux-arm64
 
 ENV ARCH aarch64
 ENV HOSTCC gcc
diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson
index 07097887f87d..e31ee43a93d8 100644
--- a/ci/docker/Dockerfile.build.jetson
+++ b/ci/docker/Dockerfile.build.jetson
@@ -22,7 +22,7 @@
 
 FROM nvidia/cuda:9.0-cudnn7-devel as cudabuilder
 
-FROM mxnetcipinned/dockcross-linux-arm64:11262018
+FROM dockcross/linux-arm64
 
 ENV ARCH aarch64
 ENV HOSTCC gcc
diff --git a/ci/docker/install/ubuntu_core.sh b/ci/docker/install/ubuntu_core.sh
index 8426b539f4c4..bd5d1f6fdf6f 100755
--- a/ci/docker/install/ubuntu_core.sh
+++ b/ci/docker/install/ubuntu_core.sh
@@ -57,11 +57,10 @@ apt-get install -y \
 ln -s /usr/lib/x86_64-linux-gnu/libturbojpeg.so.0.1.0 /usr/lib/x86_64-linux-gnu/libturbojpeg.so
 
 
-# Note: we specify an exact cmake version to work around a cmake 3.10 CUDA 10 issue.
-# Reference: https://github.com/clab/dynet/issues/1457
+# CMake 3.13.2+ is required
 mkdir /opt/cmake && cd /opt/cmake
-wget -nv https://cmake.org/files/v3.12/cmake-3.12.4-Linux-x86_64.sh
-sh cmake-3.12.4-Linux-x86_64.sh --prefix=/opt/cmake --skip-license
+wget -nv https://cmake.org/files/v3.13/cmake-3.13.5-Linux-x86_64.sh
+sh cmake-3.13.5-Linux-x86_64.sh --prefix=/opt/cmake --skip-license
 ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake
-rm cmake-3.12.4-Linux-x86_64.sh
+rm cmake-3.13.5-Linux-x86_64.sh
 cmake --version
diff --git a/ci/util.py b/ci/util.py
index 4b3a399184f9..cd5665d04df8 100644
--- a/ci/util.py
+++ b/ci/util.py
@@ -15,12 +15,15 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import os
 import contextlib
 import logging
 import logging.config
+import os
+import subprocess
 import sys
 
+import requests
+
 
 def get_mxnet_root() -> str:
     curpath = os.path.abspath(os.path.dirname(__file__))
@@ -130,3 +133,31 @@ def config_logging():
     # or sensitive information
     logging.getLogger("botocore").setLevel(logging.WARNING)
     logging.getLogger("requests").setLevel(logging.WARNING)
+
+
+# Takes url and downloads it to the dest_path directory on Windows.
+def download_file(url, dest_path):
+    file_name = url.split('/')[-1]
+    full_path = "{}\\{}".format(dest_path, file_name)
+    logging.info("Downloading: {}".format(full_path))
+    r = requests.get(url, stream=True)
+    if r.status_code == 404:
+        return r.status_code
+    elif r.status_code != 200:
+        logging.error("{} returned status code {}".format(url, r.status_code))
+    with open(full_path, 'wb') as f:
+        for chunk in r.iter_content(chunk_size=1024):
+            if chunk: # filter out keep-alive new chunks
+                f.write(chunk)
+    return full_path
+
+
+# Takes arguments and runs command on host.  Shell is disabled by default.
+def run_command(args, shell=False):
+    try:
+        logging.info("Issuing command: {}".format(args))
+        res = subprocess.check_output(args, shell=shell, timeout=1800).decode("utf-8").replace("\r\n", "")
+        logging.info("Output: {}".format(res))
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
+    return res

From efc4ad838a5193856d79ab3f70537af89e4ebffb Mon Sep 17 00:00:00 2001
From: Iblis Lin <iblis@hs.ntnu.edu.tw>
Date: Tue, 24 Dec 2019 09:14:39 +0800
Subject: [PATCH 09/25] [MXNET-1440] julia: porting `current_context` (#17142)

* julia: porting `current_context`

- And introduce new macros for changing default context
  `@context`, `@gpu`, `@cpu`
---
 julia/src/MXNet.jl             |   6 +-
 julia/src/context.jl           | 104 ++++++++++++++++++++++++++++++++-
 julia/src/ndarray/array.jl     |  18 +++---
 julia/src/ndarray/type.jl      |   2 +-
 julia/test/unittest/context.jl |  77 ++++++++++++++++++++++++
 julia/test/unittest/ndarray.jl |   2 +-
 6 files changed, 197 insertions(+), 12 deletions(-)

diff --git a/julia/src/MXNet.jl b/julia/src/MXNet.jl
index 86d7d06806fc..4c50d7321158 100644
--- a/julia/src/MXNet.jl
+++ b/julia/src/MXNet.jl
@@ -80,7 +80,11 @@ export Context,
        cpu,
        gpu,
        num_gpus,
-       gpu_memory_info
+       gpu_memory_info,
+       current_context,
+       @context,
+       @cpu,
+       @gpu
 
 # model.jl
 export AbstractModel,
diff --git a/julia/src/context.jl b/julia/src/context.jl
index 68e69138e10e..bb80e0728724 100644
--- a/julia/src/context.jl
+++ b/julia/src/context.jl
@@ -31,11 +31,89 @@ struct Context
   Context(dev_type::CONTEXT_TYPE, dev_id::Integer = 0) = new(dev_type, dev_id)
 end
 
+const _default_ctx = Ref{Context}(Context(CPU, 0))
+
 Context(dev_type::Integer, dev_id::Integer = 0) =
   Context(convert(CONTEXT_TYPE, dev_type), dev_id)
 
 Base.show(io::IO, ctx::Context) =
-  print(io, "$(ctx.device_type)$(ctx.device_id)")
+  print(io, lowercase("$(ctx.device_type)$(ctx.device_id)"))
+
+function _with_context(dev_type::Union{Symbol,Expr}, dev_id, e::Expr)
+  global _default_ctx
+  quote
+    ctx = current_context()
+    ctx′ = Context($(esc(dev_type)), $(esc(dev_id)))
+    $_default_ctx[] = ctx′
+    try
+      return $(esc(e))
+    finally
+      $_default_ctx[] = ctx
+    end
+  end
+end
+
+"""
+    @context device_type [device_id] expr
+
+Change the default context in the following expression.
+
+# Examples
+```jl-repl
+julia> mx.@context mx.GPU begin
+         mx.zeros(2, 3)
+       end
+2×3 NDArray{Float32,2} @ gpu0:
+ 0.0f0  0.0f0  0.0f0
+ 0.0f0  0.0f0  0.0f0
+
+julia> @context mx.GPU mx.zeros(3, 2)
+3×2 NDArray{Float32,2} @ gpu0:
+ 0.0f0  0.0f0
+ 0.0f0  0.0f0
+ 0.0f0  0.0f0
+```
+"""
+macro context(dev_type, e::Expr)
+  _with_context(dev_type, 0, e)
+end
+
+macro context(dev_type, dev_id, e::Expr)
+  _with_context(dev_type, dev_id, e)
+end
+
+for dev ∈ [:cpu, :gpu]
+  ctx = QuoteNode(Symbol(uppercase(string(dev))))
+  docstring = """
+        @$dev [device_id] expr
+
+    A shorthand for `@context mx.GPU`.
+
+    # Examples
+    ```jl-repl
+    julia> mx.@with_gpu mx.zeros(2, 3)
+    2×3 NDArray{Float32,2} @ gpu0:
+     0.0f0  0.0f0  0.0f0
+     0.0f0  0.0f0  0.0f0
+    ```
+    """
+  @eval begin
+    @doc $docstring ->
+    macro $dev(e::Expr)
+      ctx = $ctx
+      quote
+        @context $ctx $(esc(e))
+      end
+    end
+
+    macro $dev(dev_id, e::Expr)
+      ctx = $ctx
+      quote
+        @context $ctx $(esc(dev_id)) $(esc(e))
+      end
+    end
+  end
+end  # for dev ∈ [:cpu, :gpu]
 
 """
     cpu(dev_id)
@@ -86,3 +164,27 @@ function gpu_memory_info(dev_id = 0)
   @mxcall :MXGetGPUMemoryInformation64 (Cint, Ref{UInt64}, Ref{UInt64}) dev_id free n
   free[], n[]
 end
+
+"""
+    current_context()
+
+Return the current context.
+
+By default, `mx.cpu()` is used for all the computations
+and it can be overridden by using the `@context` macro.
+
+# Examples
+```jl-repl
+julia> mx.current_context()
+cpu0
+
+julia> mx.@context mx.GPU 1 begin  # Context changed in the following code block
+         mx.current_context()
+       end
+gpu1
+
+julia> mx.current_context()
+cpu0
+```
+"""
+current_context() = _default_ctx[]
diff --git a/julia/src/ndarray/array.jl b/julia/src/ndarray/array.jl
index b71e5ddf9397..2cd9c2e24f9c 100644
--- a/julia/src/ndarray/array.jl
+++ b/julia/src/ndarray/array.jl
@@ -28,13 +28,14 @@ Base.similar(x::NDArray{T,N}; writable = x.writable, ctx = context(x)) where {T,
   NDArray{T,N}(undef, size(x)...; writable = writable, ctx = ctx)
 
 """
-    zeros([DType], dims, [ctx::Context = cpu()])
+    zeros([DType], dims, ctx::Context = current_context())
     zeros([DType], dims...)
     zeros(x::NDArray)
 
 Create zero-ed `NDArray` with specific shape and type.
 """
-function zeros(::Type{T}, dims::NTuple{N,Int}, ctx::Context = cpu()) where {N,T<:DType}
+function zeros(::Type{T}, dims::NTuple{N,Int},
+               ctx::Context = current_context()) where {N,T<:DType}
   x = NDArray{T}(undef, dims..., ctx = ctx)
   x[:] = zero(T)
   x
@@ -42,7 +43,7 @@ end
 
 zeros(::Type{T}, dims::Int...) where {T<:DType} = zeros(T, dims)
 
-zeros(dims::NTuple{N,Int}, ctx::Context = cpu()) where N =
+zeros(dims::NTuple{N,Int}, ctx::Context = current_context()) where N =
   zeros(MX_float, dims, ctx)
 zeros(dims::Int...) = zeros(dims)
 
@@ -50,13 +51,14 @@ zeros(x::NDArray)::typeof(x)      = zeros_like(x)
 Base.zeros(x::NDArray)::typeof(x) = zeros_like(x)
 
 """
-    ones([DType], dims, [ctx::Context = cpu()])
+    ones([DType], dims, ctx::Context = current_context())
     ones([DType], dims...)
     ones(x::NDArray)
 
 Create an `NDArray` with specific shape & type, and initialize with 1.
 """
-function ones(::Type{T}, dims::NTuple{N,Int}, ctx::Context = cpu()) where {N,T<:DType}
+function ones(::Type{T}, dims::NTuple{N,Int},
+              ctx::Context = current_context()) where {N,T<:DType}
   arr = NDArray{T}(undef, dims..., ctx = ctx)
   arr[:] = one(T)
   arr
@@ -64,7 +66,7 @@ end
 
 ones(::Type{T}, dims::Int...) where T<:DType = ones(T, dims)
 
-ones(dims::NTuple{N,Int}, ctx::Context = cpu()) where N =
+ones(dims::NTuple{N,Int}, ctx::Context = current_context()) where N =
   ones(MX_float, dims, ctx)
 ones(dims::Int...) = ones(dims)
 
@@ -458,12 +460,12 @@ function Base.fill!(arr::NDArray, x)
 end
 
 """
-    fill(x, dims, ctx=cpu())
+    fill(x, dims, ctx = current_context())
     fill(x, dims...)
 
 Create an `NDArray` filled with the value `x`, like `Base.fill`.
 """
-function fill(x::T, dims::NTuple{N,Integer}, ctx::Context = cpu()) where {T,N}
+function fill(x::T, dims::NTuple{N,Integer}, ctx::Context = current_context()) where {T,N}
   arr = NDArray{T}(undef, dims, ctx = ctx)
   arr[:] = x
   arr
diff --git a/julia/src/ndarray/type.jl b/julia/src/ndarray/type.jl
index 8d90d63f0360..e24c89291dcb 100644
--- a/julia/src/ndarray/type.jl
+++ b/julia/src/ndarray/type.jl
@@ -116,7 +116,7 @@ end
 
 # UndefInitializer constructors
 NDArray{T,N}(::UndefInitializer, dims::NTuple{N,Integer};
-             writable = true, ctx::Context = cpu()) where {T,N} =
+             writable = true, ctx::Context = current_context()) where {T,N} =
   NDArray{T,N}(_ndarray_alloc(T, dims, ctx, false), writable)
 NDArray{T,N}(::UndefInitializer, dims::Vararg{Integer,N}; kw...) where {T,N} =
   NDArray{T,N}(undef, dims; kw...)
diff --git a/julia/test/unittest/context.jl b/julia/test/unittest/context.jl
index 0a8f086a194a..e903f9212930 100644
--- a/julia/test/unittest/context.jl
+++ b/julia/test/unittest/context.jl
@@ -26,8 +26,85 @@ function test_num_gpus()
   @test num_gpus() >= 0
 end
 
+function test_context_macro()
+  @info "Context::@context"
+
+  @context mx.CPU 42 begin
+    ctx = mx.current_context()
+    @test ctx.device_type == mx.CPU
+    @test ctx.device_id   == 42
+
+    @context mx.GPU 24 begin
+      ctx = mx.current_context()
+      @test ctx.device_type == mx.GPU
+      @test ctx.device_id   == 24
+    end
+
+    ctx = mx.current_context()
+    @test ctx.device_type == mx.CPU
+    @test ctx.device_id   == 42
+  end
+
+  function f()
+    ctx = mx.current_context()
+    @test ctx.device_type == mx.GPU
+    @test ctx.device_id   == 123
+  end
+
+  @context mx.GPU 123 begin
+    f()
+  end
+
+  @context mx.GPU begin
+    ctx = mx.current_context()
+    @test ctx.device_type == mx.GPU
+    @test ctx.device_id   == 0
+  end
+
+  @context mx.CPU begin
+    ctx = mx.current_context()
+    @test ctx.device_type == mx.CPU
+    @test ctx.device_id   == 0
+  end
+
+  @info "Context::@gpu"
+  @gpu 123 f()
+  @gpu begin
+    ctx = mx.current_context()
+    @test ctx.device_type == mx.GPU
+    @test ctx.device_id   == 0
+  end
+  let n = 321
+    @gpu n begin
+      ctx = mx.current_context()
+      @test ctx.device_type == mx.GPU
+      @test ctx.device_id   == 321
+    end
+  end
+
+  @info "Context::@cpu"
+  @cpu 123 begin
+    ctx = mx.current_context()
+    @test ctx.device_type == mx.CPU
+    @test ctx.device_id   == 123
+  end
+  @cpu begin
+    ctx = mx.current_context()
+    @test ctx.device_type == mx.CPU
+    @test ctx.device_id   == 0
+  end
+  let n = 321
+    @cpu n begin
+      ctx = mx.current_context()
+      @test ctx.device_type == mx.CPU
+      @test ctx.device_id   == 321
+    end
+  end
+end
+
 @testset "Context Test" begin
   test_num_gpus()
+  test_context_macro()
 end
 
 
diff --git a/julia/test/unittest/ndarray.jl b/julia/test/unittest/ndarray.jl
index 599b0a65bfc4..fb59b71edd60 100644
--- a/julia/test/unittest/ndarray.jl
+++ b/julia/test/unittest/ndarray.jl
@@ -1294,7 +1294,7 @@ function test_show()
     @test occursin("1×4", str)
     @test occursin("NDArray", str)
     @test occursin("Int64", str)
-    @test occursin("CPU", str)
+    @test occursin("cpu", str)
     @test match(r"1\s+2\s+3\s+4", str) != nothing
   end
 

From 318d9c7f8f5434bdff37ddae6bfd7b03b7e1dede Mon Sep 17 00:00:00 2001
From: Hao Jin <hjjn.amzn@gmail.com>
Date: Wed, 25 Dec 2019 16:40:11 +0800
Subject: [PATCH 10/25] Fix reshape interoperability test (#17155)

* fix reshape interoperability test

* fix for scipy import
---
 ci/docker/install/requirements                       | 4 ++--
 tests/python/unittest/test_metric.py                 | 5 +++--
 tests/python/unittest/test_numpy_interoperability.py | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/ci/docker/install/requirements b/ci/docker/install/requirements
index cbfc521e2c08..fd716f5fa815 100644
--- a/ci/docker/install/requirements
+++ b/ci/docker/install/requirements
@@ -26,8 +26,8 @@ h5py==2.8.0rc1
 mock==2.0.0
 nose==1.3.7
 nose-timer==0.7.3
-numpy>1.16.0,<2.0.0
+numpy>1.16.0,<1.18.0
 pylint==2.3.1; python_version >= '3.0'
 requests<2.19.0,>=2.18.4
-scipy==1.0.1
+scipy==1.2.1
 six==1.11.0
diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py
index a1e5128d8ac6..e7273fba35d5 100644
--- a/tests/python/unittest/test_metric.py
+++ b/tests/python/unittest/test_metric.py
@@ -18,6 +18,7 @@
 import mxnet as mx
 import numpy as np
 import scipy
+from scipy.stats import pearsonr
 import json
 import math
 from common import with_seed
@@ -267,7 +268,7 @@ def test_pearsonr():
     pred1 = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])
     label1 = mx.nd.array([[1, 0], [0, 1], [0, 1]])
     pearsonr_expected_np = np.corrcoef(pred1.asnumpy().ravel(), label1.asnumpy().ravel())[0, 1]
-    pearsonr_expected_scipy, _ = scipy.stats.pearsonr(pred1.asnumpy().ravel(), label1.asnumpy().ravel())
+    pearsonr_expected_scipy, _ = pearsonr(pred1.asnumpy().ravel(), label1.asnumpy().ravel())
     macro_pr = mx.metric.create('pearsonr', average='macro')
     micro_pr = mx.metric.create('pearsonr', average='micro')
 
@@ -289,7 +290,7 @@ def test_pearsonr():
     label12 = mx.nd.array([[1, 0], [0, 1], [0, 1], [1, 0], [0, 1], [0, 1]])
 
     pearsonr_expected_np = np.corrcoef(pred12.asnumpy().ravel(), label12.asnumpy().ravel())[0, 1]
-    pearsonr_expected_scipy, _ = scipy.stats.pearsonr(pred12.asnumpy().ravel(), label12.asnumpy().ravel())
+    pearsonr_expected_scipy, _ = pearsonr(pred12.asnumpy().ravel(), label12.asnumpy().ravel())
 
     macro_pr.reset()
     micro_pr.update([label2], [pred2])
diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py
index fcdf547bfbec..9b445044a3c1 100644
--- a/tests/python/unittest/test_numpy_interoperability.py
+++ b/tests/python/unittest/test_numpy_interoperability.py
@@ -758,7 +758,7 @@ def _add_workload_reshape():
     # OpArgMngr.add_workload('reshape', b, (2, 2), order='F')  # Items are not equal with order='F'
 
     a = np.array(_np.ones((0, 2)))
-    OpArgMngr.add_workload('reshape', a, -1, 2)
+    OpArgMngr.add_workload('reshape', a, (-1, 2))
 
 
 def _add_workload_rint(array_pool):

From 410165b0a5f903edf500786d4c1b973e389c7b57 Mon Sep 17 00:00:00 2001
From: Sheng Zha <szha@users.noreply.github.com>
Date: Wed, 25 Dec 2019 13:18:32 -0800
Subject: [PATCH 11/25] [CD] enable s3 publish for nightly builds in cd
 (#17112)

* enable s3 publish for nightly builds in cd

* pass credential through env

* confine credential variables to subprocess
---
 cd/python/pypi/Jenkins_pipeline.groovy |  3 ++-
 cd/python/pypi/pypi_publish.py         | 21 ++++++++++-----------
 ci/docker/runtime_functions.sh         |  9 +++++++++
 3 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/cd/python/pypi/Jenkins_pipeline.groovy b/cd/python/pypi/Jenkins_pipeline.groovy
index e9f172a570fe..fa9300db3ca0 100644
--- a/cd/python/pypi/Jenkins_pipeline.groovy
+++ b/cd/python/pypi/Jenkins_pipeline.groovy
@@ -27,7 +27,7 @@
 // This is a temporary solution until we are confident with the packages generated by CI
 // This should be removed in the not too distant future.
 // We only skip the publish step so we can still QA the other variants.
-pypi_releases = ["cu92", "cu92mkl"]
+pypi_releases = []
 
 def get_pipeline(mxnet_variant) {
   def node_type = mxnet_variant.startsWith('cu') ? NODE_LINUX_GPU : NODE_LINUX_CPU
@@ -72,6 +72,7 @@ def push(mxnet_variant) {
     } else {
       echo "Temporarily skipping publishing PyPI package for '${mxnet_variant}'."
     }
+    sh "./ci/docker/runtime_functions.sh cd_s3_publish"
   }
 }
 
diff --git a/cd/python/pypi/pypi_publish.py b/cd/python/pypi/pypi_publish.py
index 7e09f644c734..2729068dd503 100755
--- a/cd/python/pypi/pypi_publish.py
+++ b/cd/python/pypi/pypi_publish.py
@@ -35,10 +35,8 @@ def post_wheel(path):
     logging.info('Posting {} to PyPI'.format(path))
     pypi_credentials = get_secret()
 
-    cmd = 'python3 -m twine upload --username {} --password {} {}'.format(
-        pypi_credentials['username'],
-        pypi_credentials['password'],
-        path)
+    cmd = 'python3 -m twine upload {}'.format(path)
+    version = os.path.basename(path).split('-')[1]
 
     # The PyPI credentials for DEV has username set to 'skipPublish'
     # This way we do not attempt to publish the PyPI package
@@ -47,14 +45,15 @@ def post_wheel(path):
         print('In DEV account, skipping publish')
         print('Would have run: {}'.format(cmd))
         return 0
-    else:
+    elif any(test_version_mark in version for test_version_mark in ['a', 'b', 'dev']):
         print('Skipping publishing nightly builds to Pypi.')
         print('See https://github.com/pypa/pypi-support/issues/50 for details')
         return 0
-
-        # DO NOT PRINT CMD IN THIS BLOCK, includes password
-        p = subprocess.run(cmd.split(' '),
-                        stdout=subprocess.PIPE)
+    else:
+        env = os.environ.copy()
+        env['TWINE_USERNAME'] = pypi_credentials['username']
+        env['TWINE_PASSWORD'] = pypi_credentials['password']
+        p = subprocess.run(cmd.split(' '), stdout=subprocess.PIPE, env=env)
         logging.info(p.stdout)
         return p.returncode
 
@@ -85,7 +84,7 @@ def get_secret():
             raise e
     else:
         return json.loads(get_secret_value_response['SecretString'])
-        
-            
+
+
 if __name__ == '__main__':
     sys.exit(post_wheel(sys.argv[1]))
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index b658f953a78a..e078b2a8f89c 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -2065,6 +2065,15 @@ cd_pypi_publish() {
     ./cd/python/pypi/pypi_publish.py `readlink -f wheel_build/dist/*.whl`
 }
 
+cd_s3_publish() {
+    set -ex
+    pip3 install --user awscli
+    filepath=$(readlink -f wheel_build/dist/*.whl)
+    filename=$(basename $file_path)
+    variant=$(echo $filename | cut -d'-' -f1 | cut -d'_' -f2 -s)
+    aws s3 cp --grants read=uri=http://acs.amazonaws.com/groups/global/AllUsers,full=id=43f628fab72838a4f0b929d7f1993b14411f4b0294b011261bc6bd3e950a6822 s3://apache-mxnet/dist/${variant}/${filename}
+}
+
 build_static_scala_mkl() {
     set -ex
     pushd .

From 2551a9d8c8a4f5fd73c98e56ff79ab5410053d0e Mon Sep 17 00:00:00 2001
From: Hao Jin <hjjn.amzn@gmail.com>
Date: Thu, 26 Dec 2019 07:07:30 +0800
Subject: [PATCH 12/25] fix norm sparse fallback (#17149)

---
 src/operator/tensor/broadcast_reduce_norm_value.cc | 2 +-
 src/operator/tensor/broadcast_reduce_norm_value.cu | 2 +-
 src/operator/tensor/broadcast_reduce_op.h          | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/operator/tensor/broadcast_reduce_norm_value.cc b/src/operator/tensor/broadcast_reduce_norm_value.cc
index 4cd92d44997e..9acc157f8eca 100644
--- a/src/operator/tensor/broadcast_reduce_norm_value.cc
+++ b/src/operator/tensor/broadcast_reduce_norm_value.cc
@@ -40,7 +40,7 @@ void L2NormComputeEx<cpu>(const nnvm::NodeAttrs& attrs,
   const NormParam& param = nnvm::get<NormParam>(attrs.parsed);
   mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
   const NDArrayStorageType istype = inputs[0].storage_type();
-  const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape();
+  const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape(0, -1);
   if ((istype == kRowSparseStorage || istype == kCSRStorage) && axis.ndim() == 0 &&
        param.ord == 2) {
     // l2 norm on the entire array
diff --git a/src/operator/tensor/broadcast_reduce_norm_value.cu b/src/operator/tensor/broadcast_reduce_norm_value.cu
index 188c93e61221..735c3d7faec9 100644
--- a/src/operator/tensor/broadcast_reduce_norm_value.cu
+++ b/src/operator/tensor/broadcast_reduce_norm_value.cu
@@ -39,7 +39,7 @@ void L2NormComputeEx<gpu>(const nnvm::NodeAttrs& attrs,
   const NormParam& param = nnvm::get<NormParam>(attrs.parsed);
   mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
   const NDArrayStorageType istype = inputs[0].storage_type();
-  const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape();
+  const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape(0, -1);
   if ((istype == kRowSparseStorage || istype == kCSRStorage) && axis.ndim() == 0 &&
        param.ord == 2) {
     // l2 norm on the entire array
diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h
index 27e22491ca35..799f86544160 100644
--- a/src/operator/tensor/broadcast_reduce_op.h
+++ b/src/operator/tensor/broadcast_reduce_op.h
@@ -1152,7 +1152,7 @@ inline bool LpNormStorageType(const nnvm::NodeAttrs& attrs,
                                      DispatchMode::kFCompute);
   }
   if (param.ord == 2) {
-    const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape();
+    const mxnet::TShape axis = param.axis.has_value() ? param.axis.value() : mxnet::TShape(0, -1);
     if (!dispatched && (in_stype == kRowSparseStorage || in_stype == kCSRStorage) &&
         axis.ndim() == 0 && param.ord == 2) {
       // l2 norm: rsp/csr, axis = () -> dns

From 872b533c45a627c79e8be9800bdcadebd77b28af Mon Sep 17 00:00:00 2001
From: Xi Wang <xidulu@gmail.com>
Date: Thu, 26 Dec 2019 10:56:21 +0800
Subject: [PATCH 13/25] randn implemented (#17141)

---
 python/mxnet/numpy/random.py           | 43 +++++++++++++++++++++++++-
 tests/python/unittest/test_numpy_op.py | 22 +++++++++++++
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/python/mxnet/numpy/random.py b/python/mxnet/numpy/random.py
index ebc24de63282..95719a005cec 100644
--- a/python/mxnet/numpy/random.py
+++ b/python/mxnet/numpy/random.py
@@ -20,7 +20,7 @@
 from __future__ import absolute_import
 from ..ndarray import numpy as _mx_nd_np
 
-__all__ = ["randint", "uniform", "normal", "choice", "rand", "multinomial", "shuffle"]
+__all__ = ["randint", "uniform", "normal", "choice", "rand", "multinomial", "shuffle", "randn"]
 
 
 def randint(low, high=None, size=None, dtype=None, ctx=None, out=None):
@@ -357,3 +357,44 @@ def shuffle(x):
            [0., 1., 2.]])
     """
     _mx_nd_np.random.shuffle(x)
+
+
+def randn(*size, **kwargs):
+    r"""Return a sample (or samples) from the "standard normal" distribution.
+    If positive, int_like or int-convertible arguments are provided,
+    `randn` generates an array of shape ``(d0, d1, ..., dn)``, filled
+    with random floats sampled from a univariate "normal" (Gaussian)
+    distribution of mean 0 and variance 1 (if any of the :math:`d_i` are
+    floats, they are first converted to integers by truncation). A single
+    float randomly sampled from the distribution is returned if no
+    argument is provided.
+    This is a convenience function.  If you want an interface that takes a
+    tuple as the first argument, use `numpy.random.standard_normal` instead.
+    Parameters
+    ----------
+    d0, d1, ..., dn : int, optional
+        The dimensions of the returned array, should be all positive.
+        If no argument is given a single Python float is returned.
+    Returns
+    -------
+    Z : ndarray
+        A ``(d0, d1, ..., dn)``-shaped array of floating-point samples from
+        the standard normal distribution, or a single such float if
+        no parameters were supplied.
+    Notes
+    -----
+    For random samples from :math:`N(\mu, \sigma^2)`, use:
+    ``sigma * np.random.randn(...) + mu``
+    Examples
+    --------
+    >>> np.random.randn()
+    2.1923875335537315 #random
+    Two-by-four array of samples from N(3, 6.25):
+    >>> 2.5 * np.random.randn(2, 4) + 3
+    array([[-4.49401501,  4.00950034, -1.81814867,  7.29718677],  #random
+        [ 0.39924804,  4.68456316,  4.99394529,  4.84057254]]) #random
+    """
+    output_shape = ()
+    for s in size:
+        output_shape += (s,)
+    return _mx_nd_np.random.normal(0, 1, size=output_shape, **kwargs)
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index af9228d45991..4bbf9b8040e2 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -3089,6 +3089,28 @@ def hybrid_forward(self, F, x):
                 assert out.shape == expected_shape
 
 
+@with_seed()
+@use_np
+def test_np_randn():
+    # Test shapes.
+    shapes = [
+        (3, 3),
+        (3, 4),
+        (0, 0),
+        (3, 3, 3),
+        (0, 0, 0),
+        (2, 2, 4, 3),
+        (2, 2, 4, 3),
+        (2, 0, 3, 0),
+        (2, 0, 2, 3)
+    ]
+    dtypes = ['float16', 'float32', 'float64']
+    for dtype in dtypes:
+        for shape in shapes:
+            data_mx = np.random.randn(*shape, dtype=dtype)
+            assert data_mx.shape == shape
+
+
 @with_seed()
 @use_np
 def test_random_seed():

From 8f9ae1c883acec8c9e12b149fa44d10737a39de1 Mon Sep 17 00:00:00 2001
From: Tao Lv <tao.a.lv@intel.com>
Date: Thu, 26 Dec 2019 13:20:00 +0800
Subject: [PATCH 14/25] update mkldnn to v1.1.2 (#17165)

---
 3rdparty/mkldnn | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn
index 52c3052df8ec..cb2cc7ac17ff 160000
--- a/3rdparty/mkldnn
+++ b/3rdparty/mkldnn
@@ -1 +1 @@
-Subproject commit 52c3052df8ec1d5b8b45cb6c350a952840eabd42
+Subproject commit cb2cc7ac17ff4e2ef50805c7048d33256d82be4d

From e15c778258f78eacb75865aff9b7fe0b75d6291f Mon Sep 17 00:00:00 2001
From: Yiyan66 <57363390+Yiyan66@users.noreply.github.com>
Date: Thu, 26 Dec 2019 14:59:37 +0800
Subject: [PATCH 15/25] [numpy] fix argsort typo (#17150)

* return

* fix symbol
---
 python/mxnet/numpy/multiarray.py     | 2 +-
 python/mxnet/symbol/numpy/_symbol.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py
index 4910b4d6b925..a1b0e016445a 100644
--- a/python/mxnet/numpy/multiarray.py
+++ b/python/mxnet/numpy/multiarray.py
@@ -1377,7 +1377,7 @@ def argsort(self, axis=-1, kind=None, order=None):  # pylint: disable=arguments-
         The arguments are the same as for :py:func:`argsort`, with
         this array as data.
         """
-        raise argsort(self, axis=axis, kind=kind, order=order)
+        return argsort(self, axis=axis, kind=kind, order=order)
 
     def argmax_channel(self, *args, **kwargs):
         """Convenience fluent method for :py:func:`argmax_channel`.
diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py
index 6efc333cc16c..3ee385660715 100644
--- a/python/mxnet/symbol/numpy/_symbol.py
+++ b/python/mxnet/symbol/numpy/_symbol.py
@@ -491,7 +491,7 @@ def argsort(self, axis=-1, kind=None, order=None):  # pylint: disable=arguments-
         The arguments are the same as for :py:func:`argsort`, with
         this array as data.
         """
-        raise argsort(self, axis=axis, kind=kind, order=order)
+        return argsort(self, axis=axis, kind=kind, order=order)
 
     def argmax_channel(self, *args, **kwargs):
         """Convenience fluent method for :py:func:`argmax_channel`.

From 07913f9beb07cb130900567bfe86e8305b84fd3c Mon Sep 17 00:00:00 2001
From: Xinyu Chen <xinyu1.chen@intel.com>
Date: Thu, 26 Dec 2019 15:30:35 +0800
Subject: [PATCH 16/25] fix broken link (#17130)

---
 example/quantization/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/quantization/README.md b/example/quantization/README.md
index 8cdc1bb7e06f..b934a811f31d 100644
--- a/example/quantization/README.md
+++ b/example/quantization/README.md
@@ -9,7 +9,7 @@ This folder contains examples of quantizing a FP32 model with Intel® MKL-DNN or
 
 <h2 id="1">Model Quantization with Intel® MKL-DNN</h2>
 
-Intel® MKL-DNN supports quantization with subgraph features on Intel® CPU Platform and can bring performance improvements on the [Intel® Xeon® Scalable Platform](https://www.intel.com/content/www/us/en/processors/xeon/scalable/xeon-scalable-platform.html). A new quantization script `imagenet_gen_qsym_mkldnn.py` has been designed to launch quantization for image-classification models with Intel® MKL-DNN. This script integrates with [Gluon-CV modelzoo](https://gluon-cv.mxnet.io/model_zoo/classification.html), so that more pre-trained models can be downloaded from Gluon-CV and then converted for quantization. To apply quantization flow to your project directly, please refer [Quantize custom models with MKL-DNN backend](https://mxnet.apache.org/tutorials/mkldnn/mkldnn_quantization.html).
+Intel® MKL-DNN supports quantization with subgraph features on Intel® CPU Platform and can bring performance improvements on the [Intel® Xeon® Scalable Platform](https://www.intel.com/content/www/us/en/processors/xeon/scalable/xeon-scalable-platform.html). A new quantization script `imagenet_gen_qsym_mkldnn.py` has been designed to launch quantization for image-classification models with Intel® MKL-DNN. This script integrates with [Gluon-CV modelzoo](https://gluon-cv.mxnet.io/model_zoo/classification.html), so that more pre-trained models can be downloaded from Gluon-CV and then converted for quantization. To apply quantization flow to your project directly, please refer [Quantize custom models with MKL-DNN backend](https://mxnet.apache.org/api/python/docs/tutorials/performance/backend/mkldnn/mkldnn_quantization.html).
 
 ```
 usage: imagenet_gen_qsym_mkldnn.py [-h] [--model MODEL] [--epoch EPOCH]

From d26dd15a1b074b0fef3ef43e660679c0b696887a Mon Sep 17 00:00:00 2001
From: Yiyan66 <57363390+Yiyan66@users.noreply.github.com>
Date: Thu, 26 Dec 2019 17:14:01 +0800
Subject: [PATCH 17/25] [numpy] add op round (#17175)

* add round

* sanity

* space
---
 python/mxnet/ndarray/numpy/_op.py             | 27 +++++++++++++---
 python/mxnet/numpy/multiarray.py              | 23 ++++++++++---
 python/mxnet/numpy_dispatch_protocol.py       |  1 +
 python/mxnet/symbol/numpy/_symbol.py          | 30 +++++++++++++----
 .../unittest/test_numpy_interoperability.py   |  5 +++
 tests/python/unittest/test_numpy_op.py        | 32 +++++++++++++++++++
 6 files changed, 103 insertions(+), 15 deletions(-)

diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py
index 02e42145fb18..e380b4937168 100644
--- a/python/mxnet/ndarray/numpy/_op.py
+++ b/python/mxnet/ndarray/numpy/_op.py
@@ -37,10 +37,10 @@
            'logspace', 'expand_dims', 'tile', 'arange', 'array_split', 'split', 'vsplit', 'concatenate', 'append',
            'stack', 'vstack', 'column_stack', 'dstack', 'average', 'mean', 'maximum', 'minimum', 'swapaxes', 'clip',
            'argmax', 'argmin', 'std', 'var', 'indices', 'copysign', 'ravel', 'unravel_index', 'hanning', 'hamming',
-           'blackman', 'flip', 'around', 'hypot', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', 'unique', 'lcm',
-           'tril', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal', 'greater', 'less',
-           'greater_equal', 'less_equal', 'hsplit', 'rot90', 'einsum', 'true_divide', 'nonzero', 'shares_memory',
-           'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where', 'bincount']
+           'blackman', 'flip', 'around', 'round', 'hypot', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad',
+           'unique', 'lcm', 'tril', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal',
+           'greater', 'less', 'greater_equal', 'less_equal', 'hsplit', 'rot90', 'einsum', 'true_divide', 'nonzero',
+           'shares_memory', 'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where', 'bincount']
 
 
 @set_module('mxnet.ndarray.numpy')
@@ -4737,6 +4737,25 @@ def around(x, decimals=0, out=None, **kwargs):
         raise TypeError('type {} not supported'.format(str(type(x))))
 
 
+@set_module('mxnet.ndarray.numpy')
+def round(x, decimals=0, out=None, **kwargs):
+    r"""
+    round_(a, decimals=0, out=None)
+    Round an array to the given number of decimals.
+
+    See Also
+    --------
+    around : equivalent function; see for details.
+    """
+    from ...numpy import ndarray
+    if isinstance(x, numeric_types):
+        return _np.around(x, decimals, **kwargs)
+    elif isinstance(x, ndarray):
+        return _npi.around(x, decimals, out=out, **kwargs)
+    else:
+        raise TypeError('type {} not supported'.format(str(type(x))))
+
+
 @set_module('mxnet.ndarray.numpy')
 @wrap_np_binary_func
 def arctan2(x1, x2, out=None, **kwargs):
diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py
index a1b0e016445a..22094a1621d2 100644
--- a/python/mxnet/numpy/multiarray.py
+++ b/python/mxnet/numpy/multiarray.py
@@ -55,9 +55,9 @@
            'tensordot', 'eye', 'linspace', 'logspace', 'expand_dims', 'tile', 'arange', 'array_split',
            'split', 'vsplit', 'concatenate', 'stack', 'vstack', 'column_stack', 'dstack', 'average', 'mean',
            'maximum', 'minimum', 'swapaxes', 'clip', 'argmax', 'argmin', 'std', 'var', 'indices', 'copysign',
-           'ravel', 'unravel_index', 'hanning', 'hamming', 'blackman', 'flip', 'around', 'arctan2', 'hypot',
-           'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', 'unique', 'lcm', 'tril', 'identity', 'take',
-           'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal', 'greater', 'less', 'greater_equal',
+           'ravel', 'unravel_index', 'hanning', 'hamming', 'blackman', 'flip', 'around', 'round', 'arctan2',
+           'hypot', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', 'unique', 'lcm', 'tril', 'identity',
+           'take', 'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal', 'greater', 'less', 'greater_equal',
            'less_equal', 'hsplit', 'rot90', 'einsum', 'true_divide', 'nonzero', 'shares_memory',
            'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where', 'bincount']
 
@@ -1558,13 +1558,13 @@ def norm(self, *args, **kwargs):
         """
         raise AttributeError('mxnet.numpy.ndarray object has no attribute norm')
 
-    def round(self, *args, **kwargs):
+    def round(self, decimals=0, out=None, **kwargs): # pylint: disable=arguments-differ
         """Convenience fluent method for :py:func:`round`.
 
         The arguments are the same as for :py:func:`round`, with
         this array as data.
         """
-        raise NotImplementedError
+        return round(self, decimals=decimals, out=out, **kwargs)
 
     def rint(self, *args, **kwargs):
         """Convenience fluent method for :py:func:`rint`.
@@ -6456,6 +6456,19 @@ def around(x, decimals=0, out=None, **kwargs):
     return _mx_nd_np.around(x, decimals, out=out, **kwargs)
 
 
+@set_module('mxnet.numpy')
+def round(x, decimals=0, out=None, **kwargs):
+    r"""
+    round_(a, decimals=0, out=None)
+    Round an array to the given number of decimals.
+
+    See Also
+    --------
+    around : equivalent function; see for details.
+    """
+    return _mx_nd_np.around(x, decimals, out=out, **kwargs)
+
+
 @set_module('mxnet.numpy')
 @wrap_np_binary_func
 def arctan2(x1, x2, out=None, **kwargs):
diff --git a/python/mxnet/numpy_dispatch_protocol.py b/python/mxnet/numpy_dispatch_protocol.py
index c7e9dd1398eb..9aa755fb436e 100644
--- a/python/mxnet/numpy_dispatch_protocol.py
+++ b/python/mxnet/numpy_dispatch_protocol.py
@@ -86,6 +86,7 @@ def _run_with_array_ufunc_proto(*args, **kwargs):
     'argmin',
     'argmax',
     'around',
+    'round',
     'argsort',
     'append',
     'broadcast_arrays',
diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py
index 3ee385660715..0b341b804758 100644
--- a/python/mxnet/symbol/numpy/_symbol.py
+++ b/python/mxnet/symbol/numpy/_symbol.py
@@ -45,10 +45,10 @@
            'logspace', 'expand_dims', 'tile', 'arange', 'array_split', 'split', 'vsplit', 'concatenate', 'append',
            'stack', 'vstack', 'column_stack', 'dstack', 'average', 'mean', 'maximum', 'minimum', 'swapaxes', 'clip',
            'argmax', 'argmin', 'std', 'var', 'indices', 'copysign', 'ravel', 'unravel_index', 'hanning', 'hamming',
-           'blackman', 'flip', 'around', 'hypot', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', 'unique', 'lcm',
-           'tril', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal', 'greater', 'less',
-           'greater_equal', 'less_equal', 'hsplit', 'rot90', 'einsum', 'true_divide', 'shares_memory',
-           'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where', 'bincount']
+           'blackman', 'flip', 'around', 'round', 'hypot', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad',
+           'unique', 'lcm', 'tril', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal',
+           'greater', 'less', 'greater_equal', 'less_equal', 'hsplit', 'rot90', 'einsum', 'true_divide',
+           'shares_memory', 'may_share_memory', 'diff', 'resize', 'nan_to_num', 'where', 'bincount']
 
 
 @set_module('mxnet.symbol.numpy')
@@ -665,13 +665,13 @@ def norm(self, *args, **kwargs):
         """
         raise AttributeError('_Symbol object has no attribute norm')
 
-    def round(self, *args, **kwargs):
+    def round(self, decimals=0, out=None, **kwargs): # pylint: disable=arguments-differ
         """Convenience fluent method for :py:func:`round`.
 
         The arguments are the same as for :py:func:`round`, with
         this array as data.
         """
-        raise NotImplementedError
+        return round(self, decimals=decimals, out=out, **kwargs)
 
     def rint(self, *args, **kwargs):
         """Convenience fluent method for :py:func:`rint`.
@@ -4524,6 +4524,24 @@ def around(x, decimals=0, out=None, **kwargs):
         raise TypeError('type {} not supported'.format(str(type(x))))
 
 
+@set_module('mxnet.symbol.numpy')
+def round(x, decimals=0, out=None, **kwargs):
+    r"""
+    round_(a, decimals=0, out=None)
+    Round an array to the given number of decimals.
+
+    See Also
+    --------
+    around : equivalent function; see for details.
+    """
+    if isinstance(x, numeric_types):
+        return _np.around(x, decimals, **kwargs)
+    elif isinstance(x, _Symbol):
+        return _npi.around(x, decimals, out=out, **kwargs)
+    else:
+        raise TypeError('type {} not supported'.format(str(type(x))))
+
+
 @set_module('mxnet.symbol.numpy')
 @wrap_np_binary_func
 def arctan2(x1, x2, out=None, **kwargs):
diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py
index 9b445044a3c1..3d26ee28b22e 100644
--- a/tests/python/unittest/test_numpy_interoperability.py
+++ b/tests/python/unittest/test_numpy_interoperability.py
@@ -594,6 +594,10 @@ def _add_workload_around():
     OpArgMngr.add_workload('around', np.array([1.56, 72.54, 6.35, 3.25]), decimals=1)
 
 
+def _add_workload_round():
+    OpArgMngr.add_workload('round', np.array([1.56, 72.54, 6.35, 3.25]), decimals=1)
+
+
 def _add_workload_argsort():
     for dtype in [np.int32, np.float32]:
         a = np.arange(101, dtype=dtype)
@@ -1442,6 +1446,7 @@ def _prepare_workloads():
     _add_workload_argmin()
     _add_workload_argmax()
     _add_workload_around()
+    _add_workload_round()
     _add_workload_argsort()
     _add_workload_append()
     _add_workload_bincount()
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index 4bbf9b8040e2..3f9f1d6677cc 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -4435,6 +4435,38 @@ def hybrid_forward(self, F, x):
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
 
 
+@with_seed()
+@use_np
+def test_np_round():
+    class TestRound(HybridBlock):
+        def __init__(self, decimals):
+            super(TestRound, self).__init__()
+            self.decimals = decimals
+
+        def hybrid_forward(self, F, x):
+            return F.np.round(x, self.decimals)
+
+    shapes = [(), (1, 2, 3), (1, 0)]
+    types = ['int32', 'int64', 'float32', 'float64']
+    for hybridize in [True, False]:
+        for oneType in types:
+            rtol, atol = 1e-3, 1e-5
+            for shape in shapes:
+                for d in range(-5, 6):
+                    test_round = TestRound(d)
+                    if hybridize:
+                        test_round.hybridize()
+                    x = rand_ndarray(shape, dtype=oneType).as_np_ndarray()
+                    np_out = _np.round(x.asnumpy(), d)
+                    mx_out = test_round(x)
+                    assert mx_out.shape == np_out.shape
+                    assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
+
+                    mx_out = np.round(x, d)
+                    np_out = _np.round(x.asnumpy(), d)
+                    assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
+
+
 @with_seed()
 @use_np
 def test_np_nonzero():

From cf81887a99cea3abebbf1c9728e926a0da9f0f1b Mon Sep 17 00:00:00 2001
From: Xinyu Chen <xinyu1.chen@intel.com>
Date: Thu, 26 Dec 2019 18:53:19 +0800
Subject: [PATCH 18/25] Quantized Elemwise Mul Operator (#17147)

* add elt-wise mul xinyu

* fuse mul dequantize

* change to use subgraph

* address comments and add tests

* fix ut

* improve ut

* skip pragma omp simd for msvc

* fix lint

* fix clang error
---
 .../quantization/quantized_elemwise_mul-inl.h |  64 +++++
 .../quantization/quantized_elemwise_mul.cc    | 267 ++++++++++++++++++
 ...kldnn_elemwisemul_post_quantize_property.h | 222 +++++++++++++++
 .../mkldnn/mkldnn_subgraph_property.cc        |   2 +
 .../python/quantization/test_quantization.py  |  62 +++-
 5 files changed, 616 insertions(+), 1 deletion(-)
 create mode 100644 src/operator/quantization/quantized_elemwise_mul-inl.h
 create mode 100644 src/operator/quantization/quantized_elemwise_mul.cc
 create mode 100644 src/operator/subgraph/mkldnn/mkldnn_elemwisemul_post_quantize_property.h

diff --git a/src/operator/quantization/quantized_elemwise_mul-inl.h b/src/operator/quantization/quantized_elemwise_mul-inl.h
new file mode 100644
index 000000000000..f58db8a45eea
--- /dev/null
+++ b/src/operator/quantization/quantized_elemwise_mul-inl.h
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file quantized_elemwise_mul.cc
+ * \brief CPU Implementation of basic elementwise binary mul operators
+ */
+#ifndef MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_MUL_INL_H_
+#define MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_MUL_INL_H_
+
+#include "../tensor/elemwise_binary_op-inl.h"
+
+namespace mxnet {
+namespace op {
+/* These structure is used for requantization only when fusion */
+struct QuantizeElemwiseMulParam : public dmlc::Parameter<QuantizeElemwiseMulParam> {
+  dmlc::optional<float> min_calib_range;
+  dmlc::optional<float> max_calib_range;
+  bool enable_float_output;
+  DMLC_DECLARE_PARAMETER(QuantizeElemwiseMulParam) {
+    DMLC_DECLARE_FIELD(min_calib_range)
+    .set_default(dmlc::optional<float>())
+    .describe("The minimum scalar value in the form of float32 obtained "
+              "through calibration. If present, it will be used to requantize the "
+              "int8 output data.");
+    DMLC_DECLARE_FIELD(max_calib_range)
+    .set_default(dmlc::optional<float>())
+    .describe("The maximum scalar value in the form of float32 obtained "
+              "through calibration. If present, it will be used to requantize the "
+              "int8 output data.");
+    DMLC_DECLARE_FIELD(enable_float_output).set_default(false)
+    .describe("Whether to enable float32 output");
+  }
+};
+
+namespace quantized_elemwise_mul {
+enum QuantizedElemwiseMulOpInputs {kLhs, kRhs, kLhsMin, kLhsMax, kRhsMin, kRhsMax};
+enum QuantizedElemwiseMulOpOutputs {kOut, kOutMin, kOutMax};
+enum QuantizedElemwiseMulOpResource {kTempSpace};
+}  // namespace quantized_elemwise_mul
+
+
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_MUL_INL_H_
diff --git a/src/operator/quantization/quantized_elemwise_mul.cc b/src/operator/quantization/quantized_elemwise_mul.cc
new file mode 100644
index 000000000000..a752c14837a6
--- /dev/null
+++ b/src/operator/quantization/quantized_elemwise_mul.cc
@@ -0,0 +1,267 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file quantized_elemwise_mul.cc
+ * \brief CPU Implementation of basic elementwise binary mul operators
+ */
+#include <mxnet/op_attr_types.h>
+#include "../tensor/elemwise_binary_op-inl.h"
+#include "./quantized_elemwise_mul-inl.h"
+#include "./quantization_utils.h"
+
+namespace mxnet {
+namespace op {
+
+DMLC_REGISTER_PARAMETER(QuantizeElemwiseMulParam);
+
+static std::vector<std::string> QuantizedElemwiseMulOutputNames(const NodeAttrs &attrs) {
+  const QuantizeElemwiseMulParam& params = nnvm::get<QuantizeElemwiseMulParam>(attrs.parsed);
+  if (params.enable_float_output)
+    return std::vector<std::string>{"output"};
+  else
+    return std::vector<std::string>{"output", "min_output", "max_output"};
+}
+
+inline bool QuantizedElemwiseMulOpShape(const nnvm::NodeAttrs& attrs,
+                                        mxnet::ShapeVector *in_attrs,
+                                        mxnet::ShapeVector *out_attrs) {
+  using namespace mshadow;
+  const QuantizeElemwiseMulParam& params = nnvm::get<QuantizeElemwiseMulParam>(attrs.parsed);
+  const mxnet::TShape &lshape = (*in_attrs)[quantized_elemwise_mul::kLhs];
+  const mxnet::TShape &rshape = (*in_attrs)[quantized_elemwise_mul::kRhs];
+  if (!ndim_is_known(lshape) || !ndim_is_known(rshape)) return false;
+  CHECK_EQ(lshape.ndim(), rshape.ndim())
+    << "Currently, quantized elemwise multiply doesn't support broadcast.";
+  for (int i = 0; i < lshape.ndim(); ++i) {
+    CHECK_EQ(lshape[i], rshape[i]);
+  }
+  SHAPE_ASSIGN_CHECK(*in_attrs, quantized_elemwise_mul::kLhsMin, mxnet::TShape(1, 1));
+  SHAPE_ASSIGN_CHECK(*in_attrs, quantized_elemwise_mul::kLhsMax, mxnet::TShape(1, 1));
+  SHAPE_ASSIGN_CHECK(*in_attrs, quantized_elemwise_mul::kRhsMin, mxnet::TShape(1, 1));
+  SHAPE_ASSIGN_CHECK(*in_attrs, quantized_elemwise_mul::kRhsMax, mxnet::TShape(1, 1));
+
+  out_attrs->clear();
+  SHAPE_ASSIGN_CHECK(*out_attrs, quantized_elemwise_mul::kOut, lshape);
+  if (!params.enable_float_output) {
+    SHAPE_ASSIGN_CHECK(*out_attrs, quantized_elemwise_mul::kOutMin, mxnet::TShape(1, 1));
+    SHAPE_ASSIGN_CHECK(*out_attrs, quantized_elemwise_mul::kOutMax, mxnet::TShape(1, 1));
+  }
+  return true;
+}
+
+inline bool QuantizedElemwiseMulOpType(const nnvm::NodeAttrs& attrs,
+                                       std::vector<int> *in_type,
+                                       std::vector<int> *out_type) {
+  const QuantizeElemwiseMulParam& params = nnvm::get<QuantizeElemwiseMulParam>(attrs.parsed);
+  for (int i = 0; i < 2; ++i) {
+    if (in_type->at(i) == mshadow::kInt8) {
+      TYPE_ASSIGN_CHECK(*in_type, i, mshadow::kInt8);
+    } else {
+      LOG(ERROR) << "currently, quantized elemwise mul only support int8 inputs.";
+    }
+  }
+  TYPE_ASSIGN_CHECK(*in_type, quantized_elemwise_mul::kLhsMin, mshadow::kFloat32);
+  TYPE_ASSIGN_CHECK(*in_type, quantized_elemwise_mul::kLhsMax, mshadow::kFloat32);
+  TYPE_ASSIGN_CHECK(*in_type, quantized_elemwise_mul::kRhsMin, mshadow::kFloat32);
+  TYPE_ASSIGN_CHECK(*in_type, quantized_elemwise_mul::kRhsMax, mshadow::kFloat32);
+
+  int dtype = mshadow::kInt32;
+  if (params.max_calib_range.has_value() && params.min_calib_range.has_value()) {
+    dtype = mshadow::kInt8;
+  }
+  if (!params.enable_float_output) {
+    TYPE_ASSIGN_CHECK(*out_type, quantized_elemwise_mul::kOut, dtype);
+    TYPE_ASSIGN_CHECK(*out_type, quantized_elemwise_mul::kOutMin, mshadow::kFloat32);
+    TYPE_ASSIGN_CHECK(*out_type, quantized_elemwise_mul::kOutMax, mshadow::kFloat32);
+  } else {
+    TYPE_ASSIGN_CHECK(*out_type, quantized_elemwise_mul::kOut, mshadow::kFloat32);
+  }
+  return true;
+}
+
+inline bool QuantizedElemwiseMulOpStorageType(const nnvm::NodeAttrs& attrs,
+                                              int dev_mask,
+                                              DispatchMode* dispatch_mode,
+                                              std::vector<int> *in_attrs,
+                                              std::vector<int> *out_attrs) {
+  using namespace common;
+  *dispatch_mode = DispatchMode::kFCompute;
+
+  for (auto &v : *out_attrs) {
+    v = kDefaultStorage;
+    if (common::stype_string(v).compare("unknown") == 0) {
+      return false;
+    }
+  }
+
+  for (auto &v : *in_attrs) {
+    v = kDefaultStorage;
+    if (common::stype_string(v).compare("unknown") == 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+void QuantizedElemwiseMulOpForward(const nnvm::NodeAttrs &attrs,
+                                   const OpContext &ctx,
+                                   const std::vector<TBlob> &inputs,
+                                   const std::vector<OpReqType> &req,
+                                   const std::vector<TBlob> &outputs) {
+  const QuantizeElemwiseMulParam& params = nnvm::get<QuantizeElemwiseMulParam>(attrs.parsed);
+  using namespace mxnet_op;
+
+  float lhs_min = inputs[quantized_elemwise_mul::kLhsMin].dptr<float>()[0];
+  float lhs_max = inputs[quantized_elemwise_mul::kLhsMax].dptr<float>()[0];
+  float rhs_min = inputs[quantized_elemwise_mul::kRhsMin].dptr<float>()[0];
+  float rhs_max = inputs[quantized_elemwise_mul::kRhsMax].dptr<float>()[0];
+
+  float cached_output_min_ = 0.f;
+  float cached_output_max_ = 0.f;
+  float out_data_scale = 1.f;
+  float out_scale = 1.f;
+  if (!params.enable_float_output) {
+    float output_data_range = kInt32Range;
+    // dataA && dataB are int8
+    if (outputs[quantized_elemwise_mul::kOut].type_flag_ == mshadow::kInt8) {
+      output_data_range = kInt8Range;
+    } else {
+      output_data_range = kInt32Range;
+    }
+    if (params.max_calib_range.has_value() && params.min_calib_range.has_value()) {
+      cached_output_min_ = params.min_calib_range.value();
+      cached_output_max_ = params.max_calib_range.value();
+      out_data_scale = output_data_range / MaxAbs(cached_output_min_, cached_output_max_);
+      auto lhs_scale = kInt8Range / MaxAbs(lhs_min, lhs_max);
+      auto rhs_scale = kInt8Range / MaxAbs(rhs_min, rhs_max);
+      out_scale = out_data_scale / lhs_scale / rhs_scale;
+    } else {
+      Stream<cpu> *s = ctx.get_stream<cpu>();
+      if (inputs[quantized_elemwise_mul::kLhs].type_flag_ == mshadow::kInt8 &&
+          inputs[quantized_elemwise_mul::kRhs].type_flag_ == mshadow::kInt8) {
+        mxnet_op::Kernel<QuantizationRangeForS8S8MultiplicationStruct, cpu>::Launch(
+            s, 1, &cached_output_min_, &cached_output_max_, &lhs_min, &lhs_max, &rhs_min, &rhs_max);
+      } else {
+        LOG(ERROR) << "lhs and rhs only support iny8 dtype.";
+      }
+    }
+  } else {
+    auto lhs_scale = kInt8Range / MaxAbs(lhs_min, lhs_max);
+    auto rhs_scale = kInt8Range / MaxAbs(rhs_min, rhs_max);
+    out_scale = 1.0 / lhs_scale / rhs_scale;
+  }
+
+  size_t out_size = outputs[quantized_elemwise_mul::kOut].Size();
+  auto *input_l = inputs[quantized_elemwise_mul::kLhs].dptr<int8_t>();
+  auto *input_r = inputs[quantized_elemwise_mul::kRhs].dptr<int8_t>();
+  // TODO(Xinyu): a temp solution to enable Elemwise INT8 computation,
+  // will be refactored after the DNNL primitive is done.
+  if (!params.enable_float_output) {
+    if (params.max_calib_range.has_value() && params.min_calib_range.has_value()) {
+      typedef int8_t out_type;
+      auto *out_data = outputs[quantized_elemwise_mul::kOut].dptr<out_type>();
+#if !defined(_MSC_VER)
+#pragma omp simd
+#endif
+      for (size_t i = 0; i < out_size; ++i) {
+        const int8_t a = input_l[i];
+        const int8_t b = input_r[i];
+        out_data[i] = static_cast<out_type>(a * b * out_scale);
+      }
+    } else {
+      typedef int32_t out_type;
+      auto *out_data = outputs[quantized_elemwise_mul::kOut].dptr<out_type>();
+#if !defined(_MSC_VER)
+#pragma omp simd
+#endif
+      for (size_t i = 0; i < out_size; ++i) {
+        const int8_t a = input_l[i];
+        const int8_t b = input_r[i];
+        out_data[i] = static_cast<out_type>(a * b * out_scale);
+      }
+    }
+  } else {
+    typedef float_t out_type;
+    auto *out_data = outputs[quantized_elemwise_mul::kOut].dptr<out_type>();
+#if !defined(_MSC_VER)
+#pragma omp simd
+#endif
+    for (size_t i = 0; i < out_size; ++i) {
+      const int8_t a = input_l[i];
+      const int8_t b = input_r[i];
+      out_data[i] = static_cast<out_type>(a * b * out_scale);
+    }
+  }
+
+  if (!params.enable_float_output) {
+    outputs[quantized_elemwise_mul::kOutMin].dptr<float>()[0] = cached_output_min_;
+    outputs[quantized_elemwise_mul::kOutMax].dptr<float>()[0] = cached_output_max_;
+  }
+}
+
+NNVM_REGISTER_OP(_contrib_quantized_elemwise_mul)
+.describe(R"code(Multiplies arguments int8 element-wise.
+)code" ADD_FILELINE)
+.set_num_inputs(6)
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const QuantizeElemwiseMulParam& params = nnvm::get<QuantizeElemwiseMulParam>(attrs.parsed);
+  return (!params.enable_float_output) ? 3 : 1;
+})
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"lhs", "rhs", "lhs_min", "lhs_max", "rhs_min", "rhs_max"};
+  })
+.set_attr<nnvm::FListOutputNames>("FListOutputNames", QuantizedElemwiseMulOutputNames)
+.set_attr<mxnet::FInferShape>("FInferShape", QuantizedElemwiseMulOpShape)
+.set_attr<nnvm::FInferType>("FInferType", QuantizedElemwiseMulOpType)
+.set_attr<FInferStorageType>("FInferStorageType", QuantizedElemwiseMulOpStorageType)
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<FCompute>("FCompute<cpu>", QuantizedElemwiseMulOpForward)
+// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow,
+// will be reverted after the improvement of CachedOP is done.
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
+.set_attr<FNeedRequantize>("FNeedRequantize", [](const NodeAttrs& attrs) { return true; })
+.add_argument("lhs", "NDArray-or-Symbol", "first input")
+.add_argument("rhs", "NDArray-or-Symbol", "second input")
+.add_argument("lhs_min", "NDArray-or-Symbol", "Minimum value of first input.")
+.add_argument("lhs_max", "NDArray-or-Symbol", "Maximum value of first input.")
+.add_argument("rhs_min", "NDArray-or-Symbol", "Minimum value of second input.")
+.add_argument("rhs_max", "NDArray-or-Symbol", "Maximum value of second input.")
+.set_attr_parser(ParamParser<QuantizeElemwiseMulParam>)
+.add_arguments(QuantizeElemwiseMulParam::__FIELDS__());
+
+NNVM_REGISTER_OP(elemwise_mul)
+.set_attr<FQuantizedOp>("FQuantizedOp", [](const NodeAttrs& attrs) {
+  nnvm::NodePtr node = nnvm::Node::Create();
+  node->attrs.op = Op::Get("_contrib_quantized_elemwise_mul");
+  node->attrs.name = "quantized_" + attrs.name;
+  node->attrs.dict = attrs.dict;
+  if (node->op()->attr_parser != nullptr) {
+    node->op()->attr_parser(&(node->attrs));
+  }
+  return node;
+});
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/subgraph/mkldnn/mkldnn_elemwisemul_post_quantize_property.h b/src/operator/subgraph/mkldnn/mkldnn_elemwisemul_post_quantize_property.h
new file mode 100644
index 000000000000..1469395ec169
--- /dev/null
+++ b/src/operator/subgraph/mkldnn/mkldnn_elemwisemul_post_quantize_property.h
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file mkldnn_elemwisemul_post_quantize_property.cc
+ * \brief Partition gragph property for MKLDNN Quantized ElemwiseMul operator
+ * \author Xinyu Chen
+*/
+
+#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_ELEMWISEMUL_POST_QUANTIZE_PROPERTY_H_
+#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_ELEMWISEMUL_POST_QUANTIZE_PROPERTY_H_
+#if MXNET_USE_MKLDNN == 1
+
+#include <string>
+#include <vector>
+#include "../../tensor/elemwise_binary_op-inl.h"
+#include "../../quantization/requantize-inl.h"
+#include "../common.h"
+#include "mkldnn_subgraph_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+#define QUANTIZED_ElemwiseMul_NAME "_contrib_quantized_elemwise_mul"
+
+class ElemwiseMulPostQuantizeSelector : public SubgraphSelector {
+ public:
+  /*! \brief pattern match status */
+  enum SelectStatus {
+    kFail = 0,
+    kStart,
+    kRequantize,
+    kSuccess,
+  };
+
+ private:
+  bool disable_all;
+  bool disable_float_output;
+  SelectStatus status;
+  std::vector<const nnvm::Node *> matched_list;
+
+ public:
+  explicit ElemwiseMulPostQuantizeSelector(const bool dis_all,
+                                           const bool dis_float_output)
+      : disable_all(dis_all),
+        disable_float_output(dis_float_output) {}
+
+  bool Select(const nnvm::Node &n) override {
+    if ((!disable_all) && n.op() == Op::Get(QUANTIZED_ElemwiseMul_NAME)) {
+      status = disable_all ? kSuccess : kStart;
+      matched_list.clear();
+      matched_list.push_back(&n);
+      return true;
+    }
+    return false;
+  }
+
+  bool SelectInput(const nnvm::Node &n, const nnvm::Node &new_node) override {
+    return false;
+  }
+
+  bool SelectOutput(const nnvm::Node &n, const nnvm::Node &new_node) override {
+    if (status == kFail || status == kSuccess || new_node.is_variable())
+      return false;
+    // If n isn't the last matched node, then we encoutered a internal
+    // branch, we should pop out the node behind n and stop fusion.
+    if (matched_list.back() != &n) {
+      if (std::find(matched_list.begin(), matched_list.end(), &n) !=
+        matched_list.end()) {
+        while (matched_list.back() != &n) {
+          matched_list.pop_back();
+        }
+      }
+
+      status = kSuccess;
+      return false;
+    }
+
+    switch (status) {
+      case kStart:
+        if (new_node.op() == Op::Get("_contrib_requantize")) {
+          auto const &param = nnvm::get<RequantizeParam>(new_node.attrs.parsed);
+          if (param.min_calib_range.has_value() &&
+              param.max_calib_range.has_value()) {
+            matched_list.push_back(&new_node);
+            status = kRequantize;
+            return true;
+          }
+        }
+      case kRequantize:
+        if ((!disable_float_output) && (new_node.op() == Op::Get("_contrib_dequantize"))) {
+            matched_list.push_back(&new_node);
+            status = kSuccess;
+            return true;
+        }
+      default:
+        status = kSuccess;
+        return false;
+    }
+  }
+
+  std::vector<nnvm::Node *> Filter(
+      const std::vector<nnvm::Node *> &candidates) override {
+    if ((status != kSuccess) || (matched_list.size() <= 1)) {
+      return std::vector<nnvm::Node *>(0);
+    } else {
+      std::vector<nnvm::Node *> ret;
+      for (auto i : matched_list) {
+        auto non_const_i = const_cast<nnvm::Node *>(i);
+        if (std::find(candidates.begin(), candidates.end(), non_const_i) !=
+            candidates.end()) {
+          ret.push_back(non_const_i);
+        }
+      }
+      return ret;
+    }
+  }
+
+  void Reset() override {
+    CHECK_GE(matched_list.size(), 1);
+    auto new_selector = ElemwiseMulPostQuantizeSelector(disable_all, disable_float_output);
+    new_selector.Select(*matched_list[0]);
+    *this = new_selector;
+  }
+};
+
+class ElemwiseMulPostQuantizeProperty : public SubgraphProperty {
+ public:
+  ElemwiseMulPostQuantizeProperty() {
+    disable_fuse_all = dmlc::GetEnv("MXNET_DISABLE_MKLDNN_QEM_FUSE_ALL", false);
+    disable_float_output = dmlc::GetEnv("MXNET_DISABLE_MKLDNN_QEM_FLOAT_OUTPUT", false);
+  }
+
+  static SubgraphPropertyPtr Create() {
+    static const std::string &name = "MKLDNN EltwiseMul post-quantization optimization pass";
+    auto property = std::make_shared<ElemwiseMulPostQuantizeProperty>();
+    property->SetAttr<std::string>("property_name", name);
+    property->SetAttr<bool>("inference_only", true);
+    return property;
+  }
+
+  nnvm::NodePtr CreateSubgraphNode(const nnvm::Symbol &sym,
+                                   const int subgraph_id = 0) const override {
+    nnvm::NodePtr em_node = nullptr;
+    nnvm::NodePtr requantize_node = nullptr;
+    nnvm::NodePtr dequantize_node = nullptr;
+
+    DFSVisit(sym.outputs, [&](const nnvm::NodePtr &node) {
+      if (node->is_variable()) return;
+      if (node->op() == Op::Get(QUANTIZED_ElemwiseMul_NAME)) {
+        em_node = node;
+      } else if (node->op() == Op::Get("_contrib_requantize")) {
+        requantize_node = node;
+      } else if (node->op() == Op::Get("_contrib_dequantize")) {
+        dequantize_node = node;
+      }
+    });
+
+    CHECK_NOTNULL(em_node);
+    CHECK_NOTNULL(requantize_node);
+    auto const &requantize_param =
+        nnvm::get<RequantizeParam>(requantize_node->attrs.parsed);
+    CHECK(requantize_param.min_calib_range.has_value());
+    CHECK(requantize_param.max_calib_range.has_value());
+
+    // When only fused quantized_elemwise_mul and requantize, set min/max_cablib_range,
+    // When fused quantized_elemwise_mul + requantize + dequantize, set dequantize flag to true.
+    if (dequantize_node != nullptr) {
+      em_node->attrs.dict["enable_float_output"] = "True";
+    } else {
+      em_node->attrs.dict["min_calib_range"] =
+          std::to_string(requantize_param.min_calib_range.value());
+      em_node->attrs.dict["max_calib_range"] =
+          std::to_string(requantize_param.max_calib_range.value());
+    }
+    em_node->op()->attr_parser(&(em_node->attrs));
+    return em_node;
+  }
+
+  SubgraphSelectorPtr CreateSubgraphSelector() const override {
+    auto selector =
+        std::make_shared<ElemwiseMulPostQuantizeSelector>(disable_fuse_all,
+                                                          disable_float_output);
+    return selector;
+  }
+
+  void ConnectSubgraphOutputs(
+      const nnvm::NodePtr n,
+      std::vector<nnvm::NodeEntry *> *output_entries) const override {
+    for (size_t i = 0; i < output_entries->size(); ++i) {
+      auto entry_ptr = output_entries->at(i);
+      *entry_ptr = nnvm::NodeEntry{n, entry_ptr->index, 0};
+    }
+  }
+
+ private:
+  bool disable_fuse_all;
+  bool disable_float_output;
+};
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // if MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_ELEMWISEMUL_POST_QUANTIZE_PROPERTY_H_
diff --git a/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc b/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc
index 269017ea6a03..18cd3031ef18 100644
--- a/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc
+++ b/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc
@@ -23,6 +23,7 @@
 #include "mkldnn_fc_property.h"
 #include "mkldnn_post_quantize_property.h"
 #include "mkldnn_fc_post_quantize_property.h"
+#include "mkldnn_elemwisemul_post_quantize_property.h"
 #include "mkldnn_post_quantize_align_scale_property.h"
 
 namespace mxnet {
@@ -57,6 +58,7 @@ MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNPostQuantizeProperty);
 
 #if MXNET_USE_MKLDNN == 1
 MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNFCPostQuantizeProperty);
+MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, ElemwiseMulPostQuantizeProperty);
 
 MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNPostQuantizeAlignScaleProperty);
 #endif  // MXNET_USE_MKLDNN == 1
diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 527737e03cd7..0c40f32d1666 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -341,6 +341,66 @@ def check_quantized_elemwise_add(data_shape, qtype):
         check_quantized_elemwise_add((3, 4, 56, 56), qtype)
         check_quantized_elemwise_add((32, 56, 64, 11), qtype)
 
+@with_seed()
+def test_quantized_elemwise_mul():
+    def check_quantized_elemwise_mul(data_shape, qtype):
+        if is_test_for_native_cpu():
+            print('skipped testing quantized_elemwise_mul for native cpu since it is not supported yet')
+            return
+        elif qtype != 'int8':
+            print('skipped testing quantized_elemwise_mul for not supported data type')
+            return
+        elif is_test_for_gpu():
+            print('skipped testing quantized_elemwise_mul for gpu since it is not supported yet')
+            return
+
+        dataA = mx.sym.Variable(name='dataA', shape=data_shape, dtype='float32')
+        dataB = mx.sym.Variable(name='dataB', shape=data_shape, dtype='float32')
+        elemwise_mul_fp32 = mx.sym.elemwise_mul(dataA, dataB)
+        arg_names = elemwise_mul_fp32.list_arguments()
+        elemwise_mul_fp32_exe = elemwise_mul_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
+        if qtype == 'uint8':
+            data_low = 0.0
+            data_high = 255.0
+        else:
+            data_low = -127.0
+            data_high = 127.0
+
+        dataA_val = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('int32')
+        dataB_val = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('int32')
+        elemwise_mul_fp32_exe.arg_dict[arg_names[0]][:] = dataA_val
+
+        elemwise_mul_fp32_exe.arg_dict[arg_names[1]][:] = dataB_val
+
+        output = elemwise_mul_fp32_exe.forward()[0]
+
+        qdataA = mx.sym.Variable(name='qdataA', shape=data_shape, dtype=qtype)
+        qdataB = mx.sym.Variable(name='qdataB', shape=data_shape, dtype=qtype)
+        min_dataA = mx.sym.Variable(name='min_dataA')
+        max_dataA = mx.sym.Variable(name='max_dataA')
+        min_dataB = mx.sym.Variable(name='min_dataB')
+        max_dataB = mx.sym.Variable(name='max_dataB')
+        quantized_elemwise_mul = mx.sym.contrib.quantized_elemwise_mul(qdataA, qdataB, min_dataA, max_dataA, min_dataB, max_dataB)
+        elemwise_mul_int8_exe = quantized_elemwise_mul.simple_bind(ctx=mx.current_context(), grad_req='null')
+        qarg_names = quantized_elemwise_mul.list_arguments()
+        elemwise_mul_int8_exe.arg_dict[qarg_names[0]][:] = elemwise_mul_fp32_exe.arg_dict[arg_names[0]].astype(qtype)
+        elemwise_mul_int8_exe.arg_dict[qarg_names[1]][:] = elemwise_mul_fp32_exe.arg_dict[arg_names[1]].astype(qtype)
+        quantized_range = 127.0
+        elemwise_mul_int8_exe.arg_dict[qarg_names[2]][:] = data_low
+        elemwise_mul_int8_exe.arg_dict[qarg_names[3]][:] = data_high
+        elemwise_mul_int8_exe.arg_dict[qarg_names[4]][:] = data_low
+        elemwise_mul_int8_exe.arg_dict[qarg_names[5]][:] = data_high
+        qoutput, min_range, max_range = elemwise_mul_int8_exe.forward()
+
+        fp32_rslt = output.asnumpy()
+        int8_rslt = qoutput.astype(output.dtype)
+        assert_almost_equal(fp32_rslt, int8_rslt, atol = 1e-4)
+
+    for qtype in ['int8', 'uint8']:
+        check_quantized_elemwise_mul((4, 6), qtype)
+        check_quantized_elemwise_mul((13, 74, 52), qtype)
+        check_quantized_elemwise_mul((3, 4, 56, 56), qtype)
+        check_quantized_elemwise_mul((32, 56, 64, 11), qtype)
 
 @with_seed()
 def test_quantized_pooling():
@@ -1005,7 +1065,7 @@ def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape=N
                     else:
                         excluded_sym_names = excluded_names + optional_names
             if name == 'sym4':
-                excluded_op_names += ['elemwise_add']
+                excluded_op_names += ['elemwise_add', 'elemwise_mul']
 
             qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=s,
                                                                              arg_params=arg_params,

From d3eb0b49456736810978d75494214c4129e48494 Mon Sep 17 00:00:00 2001
From: Lin Yuan <apeforest@gmail.com>
Date: Tue, 5 Nov 2019 17:08:19 -0800
Subject: [PATCH 19/25] basic version that is verfied on CPU

---
 src/operator/mshadow_op.h     |  4 +--
 src/operator/nn/dropout-inl.h | 56 +++++++++++++++++++++++++++--------
 src/operator/nn/dropout.cc    | 20 +++++++++----
 3 files changed, 61 insertions(+), 19 deletions(-)

diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index e3a3c0443428..f59f6da4450d 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -538,8 +538,8 @@ MXNET_UNARY_MATH_OP(square, math::sqr(a));
 MXNET_UNARY_MATH_OP(square_grad, 2.0f * math::id(a));
 
 /*! \brief used for generate Bernoulli mask */
-MXNET_BINARY_MATH_OP_NC(threshold, a < b ? DType(1) : DType(0));
-MXNET_BINARY_MATH_OP_NC(threshold_eq, a <= b ? DType(1) : DType(0));
+MXNET_BINARY_LOGIC_OP_NC(threshold, a < b ? DType(1) : DType(0));
+MXNET_BINARY_LOGIC_OP_NC(threshold_eq, a <= b ? DType(1) : DType(0));
 
 /*! \brief used for generate element of abs */
 MXNET_UNARY_MATH_OP(abs, math::fabs(a)); // NOLINT(*)
diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h
index 1eff5cd8591d..25c74f712bcd 100644
--- a/src/operator/nn/dropout-inl.h
+++ b/src/operator/nn/dropout-inl.h
@@ -187,16 +187,46 @@ class DropoutOp {
                                     const index_t N,
                                     const index_t step,
                                     DType *dropout_out,
-                                    DType *mask_out,
+                                    uint8_t *mask_out,
                                     const DType *input_data,
                                     const real_t pkeep) {
       RNG_KERNEL_LOOP(xpu, DType, id, gen, N, step, {
         const real_t rand_num = static_cast<real_t>(genImpl.uniform());
-        mask_out[i] = mshadow_op::threshold_eq::Map<real_t>(rand_num, pkeep) * (1.0f / pkeep);
-        dropout_out[i] = input_data[i] * mask_out[i];
-      });
+        // mask_out is set per bit position
+        // therefore bitwise shift need to be performed here
+        auto maskIdx = i / 8;
+        auto maskOffset = i % 8;
+        bool maskVal = mshadow_op::threshold_eq::Map<real_t>(rand_num, pkeep);
+        if (maskVal) {
+          // set bit
+          mask_out[maskIdx] |= 1U << maskOffset;
+        } else {
+          // clear bit
+          mask_out[maskIdx] &= ~(1U << maskOffset);
+        }
+
+        // TODO (lnyuan): seems we can set dropout to zero if maskVal is False
+        // however doing this would break one unit test when pkeep is 0, expecting nan
+        // not sure why
+        dropout_out[i] = maskVal * input_data[i] * (1.0f / pkeep);
+      })
     }
   };
+
+  struct DropoutBackwardKernel {
+    MSHADOW_XINLINE static void Map(index_t i,
+                                    OpReqType req,
+                                    DType *igrad,
+                                    DType *ograd,
+                                    const uint8_t *mask,
+                                    const real_t pkeep) {
+      auto maskIdx = i / 8;
+      uint8_t maskOffset = i % 8;
+      bool maskVal = (mask[maskIdx] >> maskOffset) & 1U;
+      KERNEL_ASSIGN(igrad[i], req, maskVal * ograd[i] * (1 / pkeep));
+    }
+  };
+
   struct BernoulliKernel {
     /*! \brief Bernoulli kernel for generating mask */
     MSHADOW_XINLINE static void Map(index_t id,
@@ -282,7 +312,7 @@ class DropoutOp {
       CUDNN_CALL(cudnnDropoutGetReserveSpaceSize(x_desc_, &dropout_reserve_byte_));
       // cudnn uses bits to record the positions that are dropped, so reserve bytes is always
       // 1/8 of input size.
-      CHECK_GE(mask.Size() * sizeof(DType), dropout_reserve_byte_) <<
+      CHECK_GE(mask.Size() * sizeof(uint8_t), dropout_reserve_byte_) <<
         "The size of the mask space is smaller than the required cudnn reserved space.";
       CUDNN_CALL(cudnnDropoutForward(s->dnn_handle_,
                                      dropout_desc_,
@@ -290,7 +320,7 @@ class DropoutOp {
                                      in.dptr<DType>(),
                                      y_desc_,
                                      out.dptr<DType>(),
-                                     mask.dptr<DType>(),
+                                     mask.dptr<uint8_t>(),
                                      dropout_reserve_byte_));
   }
 
@@ -328,7 +358,7 @@ class DropoutOp {
                                       out_grad.dptr<DType>(),
                                       dx_desc_,
                                       in_grad.dptr<DType>(),
-                                      mask.dptr<DType>(),
+                                      mask.dptr<uint8_t>(),
                                       dropout_reserve_byte_));
   }
 #endif  // MXNET_USE_CUDNN_DROPOUT && defined(__CUDACC__)
@@ -367,7 +397,7 @@ class DropoutOp {
           CHECK(req[dropout::kOut] != kAddTo);
           LaunchRNG<DropoutKernel, xpu>(s, pgen, out.Size(),
                                         out.dptr<DType>(),
-                                        mask.dptr<DType>(),
+                                        mask.dptr<uint8_t>(),
                                         in.dptr<DType>(),
                                         this->pkeep_);
           return;
@@ -426,6 +456,7 @@ class DropoutOp {
       const TBlob &gdata = in_grad[dropout::kData];
       const TBlob &grad = out_grad[dropout::kOut];
       const TBlob &mask = out_data[dropout::kMask];
+
       if (this->axes_.ndim() == 0) {
 #if MXNET_USE_MKL_DROPOUT
         if (MKLAvailable()) {
@@ -440,11 +471,12 @@ class DropoutOp {
         }
 #endif  // MXNET_USE_CUDNN_DROPOUT && defined(__CUDACC__)
         // standard case for dropout
-        CHECK_EQ(grad.Size(), mask.Size());
+        CHECK_LE(grad.Size(), mask.Size() * 8);
+
         MXNET_ASSIGN_REQ_SWITCH(req[dropout::kData], Req, {
-          mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::mul, Req>, xpu>::Launch(
-            s, gdata.Size(), gdata.dptr<DType>(), grad.dptr<DType>(), mask.dptr<DType>());
-        });
+          mxnet_op::Kernel<DropoutBackwardKernel, xpu>::Launch(
+            s, gdata.Size(), Req, gdata.dptr<DType>(), grad.dptr<DType>(), mask.dptr<uint8_t>(), pkeep_);
+        })
         return;
       } else {
         // broardcast mul
diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc
index 745bba142b6e..46a3c16a1a91 100644
--- a/src/operator/nn/dropout.cc
+++ b/src/operator/nn/dropout.cc
@@ -118,14 +118,24 @@ Example::
   if (!mxnet::ndim_is_known(dshape)) return false;
   out_shape->clear();
   out_shape->push_back(dshape);
-  for (int i = 0; i < param.axes.ndim(); ++i) {
-    dshape[param.axes[i]] = 1;
+  if (param.axes.ndim() > 0) {
+    // TODO (lnyuan): support specifying axes
+    LOG(FATAL) << "not supported yet";
+    /*
+    for (int i = 0; i < param.axes.ndim(); ++i) {
+      dshape[param.axes[i]] = 1;
+    }
+    out_shape->push_back(dshape); */
+  } else {
+    mxnet::TShape mshape(1, static_cast<dim_t>(ceil(static_cast<double>(dshape.Size()) / 8)));
+    out_shape->push_back(mshape);
   }
-  out_shape->push_back(dshape);
+
   return true;
 })
 .set_attr<nnvm::FInferType>("FInferType", [](const nnvm::NodeAttrs& attrs,
       std::vector<int> *in_type, std::vector<int> *out_type) {
+  using namespace mshadow;
   CHECK_EQ(in_type->size(), 1U);
   int dtype = in_type->at(0);
 
@@ -134,9 +144,9 @@ Example::
     return false;
   }
 
-  size_t nout = 2;
   out_type->clear();
-  for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype);
+  out_type->push_back(dtype); // data type for output
+  out_type->push_back(kUint8); // data type for mask
   return true;
 })
 .set_attr<FCreateOpState>("FCreateOpState", CreateDropoutState)

From 9d10a17c495d755666da7abf117e33f28df7d689 Mon Sep 17 00:00:00 2001
From: Lin Yuan <apeforest@gmail.com>
Date: Tue, 5 Nov 2019 22:38:53 -0800
Subject: [PATCH 20/25] add log message and TODO

---
 src/operator/nn/dropout-inl.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h
index 25c74f712bcd..753dee4b32d5 100644
--- a/src/operator/nn/dropout-inl.h
+++ b/src/operator/nn/dropout-inl.h
@@ -402,6 +402,8 @@ class DropoutOp {
                                         this->pkeep_);
           return;
         } else {
+          // TODO (lnyuan) : support axes param
+          LOG(FATAL) << "param axes is not yet supported in this PR";
           RandGenerator<xpu, DType> *pgen = ctx.requested[0].get_parallel_random<xpu, DType>();
           CHECK_NOTNULL(pgen);
           // initialize the mask

From cd01a4a73303732d2347ed85cd2b0a5c4d2dad6a Mon Sep 17 00:00:00 2001
From: Lin Yuan <apeforest@gmail.com>
Date: Sun, 22 Dec 2019 01:57:58 -0800
Subject: [PATCH 21/25] add backward support for 1-bit mask

---
 src/operator/nn/dropout-inl.h | 101 ++++++++++++++++++++++++++--------
 src/operator/nn/dropout.cc    |  12 +---
 2 files changed, 80 insertions(+), 33 deletions(-)

diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h
index 753dee4b32d5..3d7db29eef00 100644
--- a/src/operator/nn/dropout-inl.h
+++ b/src/operator/nn/dropout-inl.h
@@ -79,9 +79,10 @@ struct DropoutParam : public dmlc::Parameter<DropoutParam> {
     .set_default(dropout::kTraining)
     .describe("Whether to only turn on dropout during training or to also turn on for inference.");
     DMLC_DECLARE_FIELD(axes).set_default(mxnet::TShape(0, 0))
-    .describe("Axes for variational dropout kernel.");
+    .describe("Axes for variational dropout kernel. Same dropout will be applied to elements "
+              "along the specified axis.");
     DMLC_DECLARE_FIELD(cudnn_off).set_default(dmlc::optional<bool>(false))
-    .describe("Whether to turn off cudnn in dropout operator. "
+    .describe("Whether to turn off cuDNN in dropout operator. "
               "This option is ignored if axes is specified.");
   }
 };  // struct DropoutParam
@@ -233,12 +234,55 @@ class DropoutOp {
                                     RandGenerator<xpu, DType> gen,
                                     const index_t N,
                                     const index_t step,
-                                    DType *mask_out,
+                                    DType *dropout_out,
+                                    uint8_t *mask_out,
                                     const real_t pkeep) {
       RNG_KERNEL_LOOP(xpu, DType, id, gen, N, step, {
         const real_t rand_num = static_cast<real_t>(genImpl.uniform());
-        mask_out[i] = mshadow_op::threshold::Map<real_t>(rand_num, pkeep) * (1.0f / pkeep);
-      });
+        // mask_out is set per bit position
+        // therefore bitwise shift need to be performed here
+        auto maskIdx = i / 8;
+        auto maskOffset = i % 8;
+        bool maskVal = mshadow_op::threshold_eq::Map<real_t>(rand_num, pkeep);
+        if (maskVal) {
+          // set bit
+          mask_out[maskIdx] |= 1U << maskOffset;
+        } else {
+          // clear bit
+          mask_out[maskIdx] &= ~(1U << maskOffset);
+        }
+        dropout_out[i] = maskVal * (1.0 / pkeep);
+      })
+    }
+  };
+
+  template<int ndim>
+  struct BernoulliBackwardKernel {
+    MSHADOW_XINLINE static void Map(index_t base,
+                                    index_t length,
+                                    OpReqType req,
+                                    const Shape<ndim> &lstride,
+                                    const Shape<ndim> &rstride,
+                                    const Shape<ndim> &oshape,
+                                    DType *igrad,
+                                    DType *ograd,
+                                    const uint8_t *mask,
+                                    const real_t pkeep) {
+      Shape <ndim> coord = unravel(base, oshape);
+      auto lidx = static_cast<index_t>(dot(coord, lstride));
+      auto ridx = static_cast<index_t>(dot(coord, rstride));
+      auto maskIdx = ridx / 8;
+      uint8_t maskOffset = ridx % 8;
+      bool maskVal = (mask[maskIdx] >> maskOffset) & 1U;
+      KERNEL_ASSIGN(igrad[base], req, maskVal * ograd[lidx] * (1 / pkeep))
+      // starts from 1 to avoid extra inc at end of loop
+      for (index_t i = 1; i < length; ++i) {
+        inc(&coord, oshape, &lidx, lstride, &ridx, rstride);
+        maskIdx = ridx / 8;
+        maskOffset = ridx % 8;
+        maskVal = (mask[maskIdx] >> maskOffset) & 1U;
+        KERNEL_ASSIGN(igrad[base + i], req, maskVal * ograd[lidx] * (1 / pkeep))
+      }
     }
   };
 
@@ -402,24 +446,30 @@ class DropoutOp {
                                         this->pkeep_);
           return;
         } else {
-          // TODO (lnyuan) : support axes param
-          LOG(FATAL) << "param axes is not yet supported in this PR";
+          // allocating temp buffer to store masked output
+          TShape temp_shape = out.shape_;
+          for (int i = 0; i < this->axes_.ndim(); ++i) {
+            temp_shape[this->axes_[i]] = 1;
+          }
+          Tensor<xpu, 1, DType> temp =
+              ctx.requested[1].get_space_typed<xpu, 1, DType>(Shape1(temp_shape.Size()), s);
           RandGenerator<xpu, DType> *pgen = ctx.requested[0].get_parallel_random<xpu, DType>();
           CHECK_NOTNULL(pgen);
           // initialize the mask
-          LaunchRNG<BernoulliKernel, xpu>(s, pgen, mask.Size(),
-                                          mask.dptr<DType>(),
+          LaunchRNG<BernoulliKernel, xpu>(s, pgen, temp_shape.Size(),
+                                          temp.dptr_,
+                                          mask.dptr<uint8_t>(),
                                           this->pkeep_);
           // broadcast mul
-          mxnet::TShape new_lshape, new_rshape, new_oshape;
+          TShape new_lshape, new_rshape, new_oshape;
           int ndim = BinaryBroadcastShapeCompact(in.shape_,
-                                                 mask.shape_, out.shape_,
+                                                 temp_shape, out.shape_,
                                                  &new_lshape, &new_rshape, &new_oshape);
           if (!ndim) {
             MXNET_ASSIGN_REQ_SWITCH(req[dropout::kOut], Req, {
               mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::mul, Req>, xpu>::Launch(
                 s, out.Size(), out.dptr<DType>(), in.dptr<DType>(),
-                mask.dptr<DType>());
+                temp.dptr_);
             });
           } else {
             BROADCAST_NDIM_SWITCH(ndim, NDim, {
@@ -428,10 +478,9 @@ class DropoutOp {
               mshadow::Shape<NDim> rstride = mxnet_op::calc_stride(new_rshape.get<NDim>());
               mxnet_op::Kernel<mxnet_op::binary_broadcast_kernel<NDim, mshadow_op::mul>, xpu>::
               template LaunchEx(s, new_oshape.Size(), req[dropout::kOut],
-              lstride, rstride, oshape,
-              in.dptr<DType>(),
-              mask.dptr<DType>(), out.dptr<DType>());
-            });
+                                lstride, rstride, oshape, in.dptr<DType>(),
+                                temp.dptr_, out.dptr<DType>());
+            })
           }
         }
       } else {
@@ -477,28 +526,34 @@ class DropoutOp {
 
         MXNET_ASSIGN_REQ_SWITCH(req[dropout::kData], Req, {
           mxnet_op::Kernel<DropoutBackwardKernel, xpu>::Launch(
-            s, gdata.Size(), Req, gdata.dptr<DType>(), grad.dptr<DType>(), mask.dptr<uint8_t>(), pkeep_);
+              s, gdata.Size(), Req, gdata.dptr<DType>(), grad.dptr<DType>(),
+              mask.dptr<uint8_t>(), pkeep_);
         })
         return;
       } else {
+        TShape temp_shape = grad.shape_;
+        for (int i = 0; i < this->axes_.ndim(); ++i) {
+          temp_shape[this->axes_[i]] = 1;
+        }
         // broardcast mul
-        mxnet::TShape new_lshape, new_rshape, new_oshape;
+        TShape new_lshape, new_rshape, new_oshape;
         int ndim = BinaryBroadcastShapeCompact(grad.shape_,
-                                               mask.shape_, gdata.shape_,
+                                               temp_shape, gdata.shape_,
                                                &new_lshape, &new_rshape, &new_oshape);
         if (!ndim) {
           MXNET_ASSIGN_REQ_SWITCH(req[dropout::kData], Req, {
-            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::mul, Req>, xpu>::Launch(
-              s, gdata.Size(), gdata.dptr<DType>(), grad.dptr<DType>(), mask.dptr<DType>());
+            mxnet_op::Kernel<DropoutBackwardKernel, xpu>::Launch(
+              s, gdata.Size(), Req, gdata.dptr<DType>(), grad.dptr<DType>(),
+              mask.dptr<uint8_t >(), pkeep_);
           });
         } else {
           BROADCAST_NDIM_SWITCH(ndim, NDim, {
             mshadow::Shape<NDim> oshape = new_oshape.get<NDim>();
             mshadow::Shape<NDim> lstride = mxnet_op::calc_stride(new_lshape.get<NDim>());
             mshadow::Shape<NDim> rstride = mxnet_op::calc_stride(new_rshape.get<NDim>());
-            mxnet_op::Kernel<mxnet_op::binary_broadcast_kernel<NDim, mshadow_op::mul>, xpu>::
+            mxnet_op::Kernel<BernoulliBackwardKernel<NDim>, xpu>::
             template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape,
-            grad.dptr<DType>(), mask.dptr<DType>(), gdata.dptr<DType>());
+            gdata.dptr<DType>(), grad.dptr<DType>(), mask.dptr<uint8_t>(), pkeep_);
           });
         }
       }
diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc
index 46a3c16a1a91..711410f396fa 100644
--- a/src/operator/nn/dropout.cc
+++ b/src/operator/nn/dropout.cc
@@ -119,18 +119,12 @@ Example::
   out_shape->clear();
   out_shape->push_back(dshape);
   if (param.axes.ndim() > 0) {
-    // TODO (lnyuan): support specifying axes
-    LOG(FATAL) << "not supported yet";
-    /*
     for (int i = 0; i < param.axes.ndim(); ++i) {
       dshape[param.axes[i]] = 1;
     }
-    out_shape->push_back(dshape); */
-  } else {
-    mxnet::TShape mshape(1, static_cast<dim_t>(ceil(static_cast<double>(dshape.Size()) / 8)));
-    out_shape->push_back(mshape);
   }
-
+  mxnet::TShape mshape(1, static_cast<dim_t>(ceil(static_cast<double>(dshape.Size()) / 8)));
+  out_shape->push_back(mshape);
   return true;
 })
 .set_attr<nnvm::FInferType>("FInferType", [](const nnvm::NodeAttrs& attrs,
@@ -172,9 +166,7 @@ Example::
 #endif
     }
     request.emplace_back(ResourceRequest::kParallelRandom);
-#if MXNET_USE_MKL_DROPOUT
     request.emplace_back(ResourceRequest::kTempSpace);
-#endif
     return request;
   })
 .add_argument("data", "NDArray-or-Symbol", "Input array to which dropout will be applied.")

From 6fbf760e28c1fd2f2d3a1dffb68a481d708d9c29 Mon Sep 17 00:00:00 2001
From: Lin Yuan <apeforest@gmail.com>
Date: Mon, 23 Dec 2019 01:49:03 -0800
Subject: [PATCH 22/25] fix the race condition in LaunchRNG

---
 src/operator/nn/dropout-inl.h          | 59 ++++++++++++--------------
 src/operator/nn/dropout.cc             |  7 ++-
 src/operator/random/sampler.h          |  3 +-
 tests/python/unittest/test_operator.py |  2 +-
 4 files changed, 34 insertions(+), 37 deletions(-)

diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h
index 3d7db29eef00..9b8ef412df61 100644
--- a/src/operator/nn/dropout-inl.h
+++ b/src/operator/nn/dropout-inl.h
@@ -179,7 +179,7 @@ class DropoutOp {
      * \param N Total number of items in the output
      * \param step Step between items, related to parallelism
      * \param dropout_out Output dropout values
-     * \param mask_out  Output mask (is multiplied to create dropout output, may be 0)
+     * \param mask_out  Output mask with one bit for one element
      * \param input_data Input data to perform the dropout on
      * \param pkeep Dropout rate (keep when the generated random number is less than this value)
      */
@@ -191,25 +191,22 @@ class DropoutOp {
                                     uint8_t *mask_out,
                                     const DType *input_data,
                                     const real_t pkeep) {
+      CHECK_EQ(step & 7, 0);
       RNG_KERNEL_LOOP(xpu, DType, id, gen, N, step, {
         const real_t rand_num = static_cast<real_t>(genImpl.uniform());
         // mask_out is set per bit position
         // therefore bitwise shift need to be performed here
-        auto maskIdx = i / 8;
-        auto maskOffset = i % 8;
-        bool maskVal = mshadow_op::threshold_eq::Map<real_t>(rand_num, pkeep);
-        if (maskVal) {
+        auto mask_idx = i / 8;
+        auto mask_offset = i % 8;
+        bool mask_val = mshadow_op::threshold_eq::Map<real_t>(rand_num, pkeep);
+        if (mask_val) {
           // set bit
-          mask_out[maskIdx] |= 1U << maskOffset;
+          mask_out[mask_idx] |= 1U << mask_offset;
         } else {
           // clear bit
-          mask_out[maskIdx] &= ~(1U << maskOffset);
+          mask_out[mask_idx] &= ~(1U << mask_offset);
         }
-
-        // TODO (lnyuan): seems we can set dropout to zero if maskVal is False
-        // however doing this would break one unit test when pkeep is 0, expecting nan
-        // not sure why
-        dropout_out[i] = maskVal * input_data[i] * (1.0f / pkeep);
+        dropout_out[i] = mask_val * input_data[i] * (1.0f / pkeep);
       })
     }
   };
@@ -221,10 +218,10 @@ class DropoutOp {
                                     DType *ograd,
                                     const uint8_t *mask,
                                     const real_t pkeep) {
-      auto maskIdx = i / 8;
-      uint8_t maskOffset = i % 8;
-      bool maskVal = (mask[maskIdx] >> maskOffset) & 1U;
-      KERNEL_ASSIGN(igrad[i], req, maskVal * ograd[i] * (1 / pkeep));
+      auto mask_idx = i / 8;
+      uint8_t mask_offset = i % 8;
+      bool mask_val = (mask[mask_idx] >> mask_offset) & 1U;
+      KERNEL_ASSIGN(igrad[i], req, mask_val * ograd[i] * (1 / pkeep));
     }
   };
 
@@ -241,17 +238,17 @@ class DropoutOp {
         const real_t rand_num = static_cast<real_t>(genImpl.uniform());
         // mask_out is set per bit position
         // therefore bitwise shift need to be performed here
-        auto maskIdx = i / 8;
-        auto maskOffset = i % 8;
-        bool maskVal = mshadow_op::threshold_eq::Map<real_t>(rand_num, pkeep);
-        if (maskVal) {
+        auto mask_idx = i / 8;
+        auto mask_offset = i % 8;
+        bool mask_val = mshadow_op::threshold_eq::Map<real_t>(rand_num, pkeep);
+        if (mask_val) {
           // set bit
-          mask_out[maskIdx] |= 1U << maskOffset;
+          mask_out[mask_idx] |= 1U << mask_offset;
         } else {
           // clear bit
-          mask_out[maskIdx] &= ~(1U << maskOffset);
+          mask_out[mask_idx] &= ~(1U << mask_offset);
         }
-        dropout_out[i] = maskVal * (1.0 / pkeep);
+        dropout_out[i] = mask_val * (1.0 / pkeep);
       })
     }
   };
@@ -271,17 +268,17 @@ class DropoutOp {
       Shape <ndim> coord = unravel(base, oshape);
       auto lidx = static_cast<index_t>(dot(coord, lstride));
       auto ridx = static_cast<index_t>(dot(coord, rstride));
-      auto maskIdx = ridx / 8;
-      uint8_t maskOffset = ridx % 8;
-      bool maskVal = (mask[maskIdx] >> maskOffset) & 1U;
-      KERNEL_ASSIGN(igrad[base], req, maskVal * ograd[lidx] * (1 / pkeep))
+      auto mask_idx = ridx / 8;
+      uint8_t mask_offset = ridx % 8;
+      bool mask_val = (mask[mask_idx] >> mask_offset) & 1U;
+      KERNEL_ASSIGN(igrad[base], req, mask_val * ograd[lidx] * (1 / pkeep))
       // starts from 1 to avoid extra inc at end of loop
       for (index_t i = 1; i < length; ++i) {
         inc(&coord, oshape, &lidx, lstride, &ridx, rstride);
-        maskIdx = ridx / 8;
-        maskOffset = ridx % 8;
-        maskVal = (mask[maskIdx] >> maskOffset) & 1U;
-        KERNEL_ASSIGN(igrad[base + i], req, maskVal * ograd[lidx] * (1 / pkeep))
+        mask_idx = ridx / 8;
+        mask_offset = ridx % 8;
+        mask_val = (mask[mask_idx] >> mask_offset) & 1U;
+        KERNEL_ASSIGN(igrad[base + i], req, mask_val * ograd[lidx] * (1 / pkeep))
       }
     }
   };
diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc
index 711410f396fa..3e3d806558bb 100644
--- a/src/operator/nn/dropout.cc
+++ b/src/operator/nn/dropout.cc
@@ -111,10 +111,9 @@ Example::
 })
 .set_attr<mxnet::FInferShape>("FInferShape", [](const nnvm::NodeAttrs& attrs,
       mxnet::ShapeVector *in_shape, mxnet::ShapeVector *out_shape){
-  using namespace mshadow;
   CHECK_EQ(in_shape->size(), 1U);
   const DropoutParam& param = nnvm::get<DropoutParam>(attrs.parsed);
-  mxnet::TShape dshape(in_shape->at(0));
+  TShape dshape(in_shape->at(0));
   if (!mxnet::ndim_is_known(dshape)) return false;
   out_shape->clear();
   out_shape->push_back(dshape);
@@ -123,13 +122,13 @@ Example::
       dshape[param.axes[i]] = 1;
     }
   }
-  mxnet::TShape mshape(1, static_cast<dim_t>(ceil(static_cast<double>(dshape.Size()) / 8)));
+  // Use 1-bit in mask by rounding up dshape.Size() / 8
+  TShape mshape(1, static_cast<dim_t>((dshape.Size() + 7) / 8));
   out_shape->push_back(mshape);
   return true;
 })
 .set_attr<nnvm::FInferType>("FInferType", [](const nnvm::NodeAttrs& attrs,
       std::vector<int> *in_type, std::vector<int> *out_type) {
-  using namespace mshadow;
   CHECK_EQ(in_type->size(), 1U);
   int dtype = in_type->at(0);
 
diff --git a/src/operator/random/sampler.h b/src/operator/random/sampler.h
index 1a9bf7a4d169..2591dc51171d 100644
--- a/src/operator/random/sampler.h
+++ b/src/operator/random/sampler.h
@@ -53,7 +53,8 @@ inline static void LaunchRNG(mshadow::Stream<xpu> *s,
                     RandGenerator<xpu>::kMinNumRandomPerThread;
   const index_t nthread = std::min(nloop,
                                    static_cast<index_t>(RandGenerator<xpu>::kNumRandomStates));
-  const index_t step = (N + nthread - 1) / nthread;
+  const index_t step = ((N + nthread - 1) / nthread + RandGenerator<xpu>::kMinNumRandomPerThread - 1) /
+      RandGenerator<xpu>::kMinNumRandomPerThread * RandGenerator<xpu>::kMinNumRandomPerThread;
   Kernel<OP, xpu>::Launch(s, nthread, *gen, N, step, args...);
 }
 
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index d59c3063f95a..923ede44c340 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -6955,7 +6955,7 @@ def test_stack():
 
 
 @with_seed()
-@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/14288")
+#@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/14288")
 def test_dropout():
     def zero_count(array, ratio):
         zeros = 0

From fc86989c7a0f3a84326ebd1d7abb5f905472df7e Mon Sep 17 00:00:00 2001
From: Lin Yuan <apeforest@gmail.com>
Date: Mon, 23 Dec 2019 15:43:53 -0800
Subject: [PATCH 23/25] refactoring to improve readability

---
 src/operator/nn/dropout-inl.h |  8 ++++----
 src/operator/random/sampler.h | 12 ++++++------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h
index 9b8ef412df61..600832a86abe 100644
--- a/src/operator/nn/dropout-inl.h
+++ b/src/operator/nn/dropout-inl.h
@@ -197,7 +197,7 @@ class DropoutOp {
         // mask_out is set per bit position
         // therefore bitwise shift need to be performed here
         auto mask_idx = i / 8;
-        auto mask_offset = i % 8;
+        uint8_t mask_offset = i % 8;
         bool mask_val = mshadow_op::threshold_eq::Map<real_t>(rand_num, pkeep);
         if (mask_val) {
           // set bit
@@ -239,7 +239,7 @@ class DropoutOp {
         // mask_out is set per bit position
         // therefore bitwise shift need to be performed here
         auto mask_idx = i / 8;
-        auto mask_offset = i % 8;
+        uint8_t mask_offset = i % 8;
         bool mask_val = mshadow_op::threshold_eq::Map<real_t>(rand_num, pkeep);
         if (mask_val) {
           // set bit
@@ -477,7 +477,7 @@ class DropoutOp {
               template LaunchEx(s, new_oshape.Size(), req[dropout::kOut],
                                 lstride, rstride, oshape, in.dptr<DType>(),
                                 temp.dptr_, out.dptr<DType>());
-            })
+            });
           }
         }
       } else {
@@ -519,7 +519,7 @@ class DropoutOp {
         }
 #endif  // MXNET_USE_CUDNN_DROPOUT && defined(__CUDACC__)
         // standard case for dropout
-        CHECK_LE(grad.Size(), mask.Size() * 8);
+        CHECK_EQ((grad.Size() + 7) / 8, mask.Size());
 
         MXNET_ASSIGN_REQ_SWITCH(req[dropout::kData], Req, {
           mxnet_op::Kernel<DropoutBackwardKernel, xpu>::Launch(
diff --git a/src/operator/random/sampler.h b/src/operator/random/sampler.h
index 2591dc51171d..8ef5e2bffe1a 100644
--- a/src/operator/random/sampler.h
+++ b/src/operator/random/sampler.h
@@ -49,13 +49,13 @@ inline static void LaunchRNG(mshadow::Stream<xpu> *s,
   if (N <= 0) {
     return;
   }
-  const index_t nloop = (N + RandGenerator<xpu>::kMinNumRandomPerThread - 1) /
-                    RandGenerator<xpu>::kMinNumRandomPerThread;
-  const index_t nthread = std::min(nloop,
-                                   static_cast<index_t>(RandGenerator<xpu>::kNumRandomStates));
-  const index_t step = ((N + nthread - 1) / nthread + RandGenerator<xpu>::kMinNumRandomPerThread - 1) /
+  int num_threads = (N + RandGenerator<xpu>::kMinNumRandomPerThread - 1) /
+      RandGenerator<xpu>::kMinNumRandomPerThread;
+  num_threads = std::min(num_threads, RandGenerator<xpu>::kNumRandomStates);
+  const index_t num_steps_per_thread =
+      ((N + num_threads - 1) / num_threads + RandGenerator<xpu>::kMinNumRandomPerThread - 1) /
       RandGenerator<xpu>::kMinNumRandomPerThread * RandGenerator<xpu>::kMinNumRandomPerThread;
-  Kernel<OP, xpu>::Launch(s, nthread, *gen, N, step, args...);
+  Kernel<OP, xpu>::Launch(s, num_threads, *gen, N, num_steps_per_thread, args...);
 }
 
 #define RNG_KERNEL_LOOP(xpu, GType, thread_id, gen, N, step, ...)        \

From 4dc98c7b1dfc2c5be7095cffdb2da1855ed9995e Mon Sep 17 00:00:00 2001
From: Lin Yuan <apeforest@gmail.com>
Date: Thu, 26 Dec 2019 12:14:03 -0800
Subject: [PATCH 24/25] address reviewer comment and test w/o cudnn

---
 src/operator/nn/dropout-inl.h          | 26 +++++++-------
 src/operator/random/sampler.h          | 31 +++++++++++++++--
 tests/python/unittest/test_operator.py | 47 +++++++++++++-------------
 3 files changed, 66 insertions(+), 38 deletions(-)

diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h
index 600832a86abe..6464b6ae8844 100644
--- a/src/operator/nn/dropout-inl.h
+++ b/src/operator/nn/dropout-inl.h
@@ -220,7 +220,7 @@ class DropoutOp {
                                     const real_t pkeep) {
       auto mask_idx = i / 8;
       uint8_t mask_offset = i % 8;
-      bool mask_val = (mask[mask_idx] >> mask_offset) & 1U;
+      bool mask_val = mask[mask_idx] & (1U << mask_offset);
       KERNEL_ASSIGN(igrad[i], req, mask_val * ograd[i] * (1 / pkeep));
     }
   };
@@ -270,14 +270,14 @@ class DropoutOp {
       auto ridx = static_cast<index_t>(dot(coord, rstride));
       auto mask_idx = ridx / 8;
       uint8_t mask_offset = ridx % 8;
-      bool mask_val = (mask[mask_idx] >> mask_offset) & 1U;
+      bool mask_val = mask[mask_idx] & (1U << mask_offset);
       KERNEL_ASSIGN(igrad[base], req, mask_val * ograd[lidx] * (1 / pkeep))
       // starts from 1 to avoid extra inc at end of loop
       for (index_t i = 1; i < length; ++i) {
         inc(&coord, oshape, &lidx, lstride, &ridx, rstride);
         mask_idx = ridx / 8;
         mask_offset = ridx % 8;
-        mask_val = (mask[mask_idx] >> mask_offset) & 1U;
+        mask_val = mask[mask_idx] & (1U << mask_offset);
         KERNEL_ASSIGN(igrad[base + i], req, mask_val * ograd[lidx] * (1 / pkeep))
       }
     }
@@ -436,11 +436,12 @@ class DropoutOp {
           RandGenerator<xpu, DType> *pgen = ctx.requested[0].get_parallel_random<xpu, DType>();
           CHECK_NOTNULL(pgen);
           CHECK(req[dropout::kOut] != kAddTo);
-          LaunchRNG<DropoutKernel, xpu>(s, pgen, out.Size(),
-                                        out.dptr<DType>(),
-                                        mask.dptr<uint8_t>(),
-                                        in.dptr<DType>(),
-                                        this->pkeep_);
+          // Use batch size 8 to avoid race condition on mask
+          LaunchRNGBatch<DropoutKernel, xpu>(s, pgen, out.Size(), 8 /* batch_size */,
+                                             out.dptr<DType>(),
+                                             mask.dptr<uint8_t>(),
+                                             in.dptr<DType>(),
+                                             this->pkeep_);
           return;
         } else {
           // allocating temp buffer to store masked output
@@ -453,10 +454,11 @@ class DropoutOp {
           RandGenerator<xpu, DType> *pgen = ctx.requested[0].get_parallel_random<xpu, DType>();
           CHECK_NOTNULL(pgen);
           // initialize the mask
-          LaunchRNG<BernoulliKernel, xpu>(s, pgen, temp_shape.Size(),
-                                          temp.dptr_,
-                                          mask.dptr<uint8_t>(),
-                                          this->pkeep_);
+          // Use batch size 8 to avoid race condition on mask
+          LaunchRNGBatch<BernoulliKernel, xpu>(s, pgen, temp_shape.Size(), 8 /* batch_size */,
+                                               temp.dptr_,
+                                               mask.dptr<uint8_t>(),
+                                               this->pkeep_);
           // broadcast mul
           TShape new_lshape, new_rshape, new_oshape;
           int ndim = BinaryBroadcastShapeCompact(in.shape_,
diff --git a/src/operator/random/sampler.h b/src/operator/random/sampler.h
index 8ef5e2bffe1a..60313024d907 100644
--- a/src/operator/random/sampler.h
+++ b/src/operator/random/sampler.h
@@ -52,9 +52,34 @@ inline static void LaunchRNG(mshadow::Stream<xpu> *s,
   int num_threads = (N + RandGenerator<xpu>::kMinNumRandomPerThread - 1) /
       RandGenerator<xpu>::kMinNumRandomPerThread;
   num_threads = std::min(num_threads, RandGenerator<xpu>::kNumRandomStates);
-  const index_t num_steps_per_thread =
-      ((N + num_threads - 1) / num_threads + RandGenerator<xpu>::kMinNumRandomPerThread - 1) /
-      RandGenerator<xpu>::kMinNumRandomPerThread * RandGenerator<xpu>::kMinNumRandomPerThread;
+  index_t num_steps_per_thread = std::max((N + num_threads - 1) / num_threads,
+      RandGenerator<xpu>::kMinNumRandomPerThread);
+  Kernel<OP, xpu>::Launch(s, num_threads, *gen, N, num_steps_per_thread, args...);
+}
+
+/*!
+ * \brief Launch a generic kernel with parallel random generator.
+ * Each thread will perform a batch of iterations sequentially.
+ * \tparam gen random generator
+ * \tparam N Number of iterations
+ * \tparam batch_size number of iterations to be performed in a batch per thread
+ * \tparam Args Varargs type to eventually pass to the OP::Map() function
+ */
+template<typename OP, typename xpu, typename GType, typename ...Args>
+inline static void LaunchRNGBatch(mshadow::Stream<xpu> *s,
+                                  common::random::RandGenerator<xpu, GType> *gen,
+                                  const index_t N, const int batch_size, Args... args) {
+  // minimal check to avoid division by zero, below.
+  // if `N` is zero the map operation is a no-op in any case.
+  if (N <= 0) {
+    return;
+  }
+  int num_threads = (N + RandGenerator<xpu>::kMinNumRandomPerThread - 1) /
+      RandGenerator<xpu>::kMinNumRandomPerThread;
+  num_threads = std::min(num_threads, RandGenerator<xpu>::kNumRandomStates);
+  index_t num_steps_per_thread = std::max((N + num_threads - 1) / num_threads,
+      RandGenerator<xpu>::kMinNumRandomPerThread);
+  num_steps_per_thread = (num_steps_per_thread + batch_size - 1) / batch_size * batch_size;
   Kernel<OP, xpu>::Launch(s, num_threads, *gen, N, num_steps_per_thread, args...);
 }
 
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 923ede44c340..5829fb6678a9 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -6954,8 +6954,9 @@ def test_stack():
         check_numeric_gradient(out, inputs)
 
 
+# TODO (lnyuan): Temporarily disable cudnn in tests due to flaky test issue
+# https://github.com/apache/incubator-mxnet/issues/14288
 @with_seed()
-#@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/14288")
 def test_dropout():
     def zero_count(array, ratio):
         zeros = 0
@@ -7077,39 +7078,40 @@ def check_passthrough(ratio, shape, cudnn_off=True):
         assert_almost_equal(a.grad.asnumpy(), mx.nd.ones_like(b).asnumpy())
 
     shape = (100, 100)
-    check_dropout_ratio(0.5, shape)
-    check_dropout_ratio(0.0, shape)
-    check_dropout_ratio(1.0, shape)
-    check_dropout_ratio(0.75, shape)
-    check_dropout_ratio(0.25, shape)
+
+    #check_dropout_ratio(0.5, shape)
+    #check_dropout_ratio(0.0, shape)
+    #check_dropout_ratio(1.0, shape)
+    #check_dropout_ratio(0.75, shape)
+    #check_dropout_ratio(0.25, shape)
     check_dropout_ratio(0.5, shape, cudnn_off=False)
     check_dropout_ratio(0.0, shape, cudnn_off=False)
     check_dropout_ratio(1.0, shape, cudnn_off=False)
     check_dropout_ratio(0.75, shape, cudnn_off=False)
     check_dropout_ratio(0.25, shape, cudnn_off=False)
 
-    check_passthrough(0.5, shape)
-    check_passthrough(0.0, shape)
-    check_passthrough(1.0, shape)
+    #check_passthrough(0.5, shape)
+    #check_passthrough(0.0, shape)
+    #check_passthrough(1.0, shape)
     check_passthrough(0.5, shape, cudnn_off=False)
     check_passthrough(0.0, shape, cudnn_off=False)
     check_passthrough(1.0, shape, cudnn_off=False)
 
     nshape = (10, 10, 10, 10)
     with mx.autograd.train_mode():
-        check_dropout_axes(0.25, nshape, axes = (0,))
-        check_dropout_axes(0.25, nshape, axes = (1,))
-        check_dropout_axes(0.25, nshape, axes = (2,))
-        check_dropout_axes(0.25, nshape, axes = (3,))
-        check_dropout_axes(0.25, nshape, axes = (0, 1))
-        check_dropout_axes(0.25, nshape, axes = (0, 2))
-        check_dropout_axes(0.25, nshape, axes = (0, 3))
-        check_dropout_axes(0.25, nshape, axes = (1, 2))
-        check_dropout_axes(0.25, nshape, axes = (1, 3))
-        check_dropout_axes(0.25, nshape, axes = (2, 3))
-        check_dropout_axes(0.25, nshape, axes = (0, 1, 2))
-        check_dropout_axes(0.25, nshape, axes = (0, 2, 3))
-        check_dropout_axes(0.25, nshape, axes = (1, 2, 3))
+        #check_dropout_axes(0.25, nshape, axes = (0,))
+        #check_dropout_axes(0.25, nshape, axes = (1,))
+        #check_dropout_axes(0.25, nshape, axes = (2,))
+        #check_dropout_axes(0.25, nshape, axes = (3,))
+        #check_dropout_axes(0.25, nshape, axes = (0, 1))
+        #check_dropout_axes(0.25, nshape, axes = (0, 2))
+        #check_dropout_axes(0.25, nshape, axes = (0, 3))
+        #check_dropout_axes(0.25, nshape, axes = (1, 2))
+        #check_dropout_axes(0.25, nshape, axes = (1, 3))
+        #check_dropout_axes(0.25, nshape, axes = (2, 3))
+        #check_dropout_axes(0.25, nshape, axes = (0, 1, 2))
+        #check_dropout_axes(0.25, nshape, axes = (0, 2, 3))
+        #check_dropout_axes(0.25, nshape, axes = (1, 2, 3))
         check_dropout_axes(0.25, nshape, axes = (0,), cudnn_off=False)
         check_dropout_axes(0.25, nshape, axes = (1,), cudnn_off=False)
         check_dropout_axes(0.25, nshape, axes = (2,), cudnn_off=False)
@@ -7125,7 +7127,6 @@ def check_passthrough(ratio, shape, cudnn_off=True):
         check_dropout_axes(0.25, nshape, axes = (1, 2, 3), cudnn_off=False)
 
 
-
 @unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/11290")
 @with_seed()
 def test_scatter_gather_nd():

From 78a40d52e67c3262ea7a5fc5afb2297c13139f88 Mon Sep 17 00:00:00 2001
From: Lin Yuan <apeforest@gmail.com>
Date: Thu, 26 Dec 2019 14:04:12 -0800
Subject: [PATCH 25/25] remove check from kernel

---
 src/operator/nn/dropout-inl.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h
index 6464b6ae8844..99a17f2133d3 100644
--- a/src/operator/nn/dropout-inl.h
+++ b/src/operator/nn/dropout-inl.h
@@ -191,7 +191,6 @@ class DropoutOp {
                                     uint8_t *mask_out,
                                     const DType *input_data,
                                     const real_t pkeep) {
-      CHECK_EQ(step & 7, 0);
       RNG_KERNEL_LOOP(xpu, DType, id, gen, N, step, {
         const real_t rand_num = static_cast<real_t>(genImpl.uniform());
         // mask_out is set per bit position